xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision e9c94282e3e09d4c4e684ccfecc4a4360e1bcc06)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 #undef __FUNCT__
27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 
36   PetscFunctionBegin;
37   if (reuse == MAT_INITIAL_MATRIX) {
38     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
39     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50 
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55 
56   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
57    * simply involves destroying the MKL sparse matrix handle and then freeing
58    * the spptr pointer. */
59 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
60   if (aijmkl->sparse_optimized) {
61     sparse_status_t stat;
62     stat = mkl_sparse_destroy(aijmkl->csrA);
63     if (stat != SPARSE_STATUS_SUCCESS) {
64       PetscFunctionReturn(PETSC_ERR_LIB);
65     }
66   }
67 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
68   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
69 
70   /* Change the type of B to MATSEQAIJ. */
71   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
72 
73   *newmat = B;
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNCT__
78 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNCT__
113 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
114 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
115 {
116   PetscErrorCode ierr;
117   Mat_SeqAIJMKL *aijmkl;
118   Mat_SeqAIJMKL *aijmkl_dest;
119 
120   PetscFunctionBegin;
121   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
122   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
123   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
124   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
125   aijmkl_dest->sparse_optimized = PETSC_FALSE;
126 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
127   aijmkl_dest->csrA = NULL;
128   if (!aijmkl->no_SpMV2) {
129     sparse_status_t stat;
130     stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA);
131     stat = mkl_sparse_optimize(aijmkl_dest->csrA);
132     if (stat != SPARSE_STATUS_SUCCESS) {
133       PetscFunctionReturn(PETSC_ERR_LIB);
134     }
135     aijmkl_dest->sparse_optimized = PETSC_TRUE;
136   }
137 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
138   PetscFunctionReturn(0);
139 }
140 
141 #undef __FUNCT__
142 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
143 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
144 {
145   PetscErrorCode  ierr;
146   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
147   Mat_SeqAIJMKL   *aijmkl;
148 
149   MatScalar       *aa;
150   PetscInt        n;
151   PetscInt        *aj,*ai;
152 
153   PetscFunctionBegin;
154   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
155 
156   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
157    * extra information and some different methods, call the AssemblyEnd
158    * routine for a MATSEQAIJ.
159    * I'm not sure if this is the best way to do this, but it avoids
160    * a lot of code duplication.
161    * I also note that currently MATSEQAIJMKL doesn't know anything about
162    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
163    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
164    * this, this may break things.  (Don't know... haven't looked at it.
165    * Do I need to disable this somehow?) */
166   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
167   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
168 
169   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
170 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
171   if (!aijmkl->no_SpMV2) {
172     sparse_status_t stat;
173     if (aijmkl->sparse_optimized) {
174       /* Matrix has been previously assembled and optimized. Must destroy old
175        * matrix handle before running the optimization step again. */
176       sparse_status_t stat;
177       stat = mkl_sparse_destroy(aijmkl->csrA);
178       if (stat != SPARSE_STATUS_SUCCESS) {
179         PetscFunctionReturn(PETSC_ERR_LIB);
180       }
181     }
182     /* Now perform the SpMV2 setup and matrix optimization. */
183     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
184     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
185     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
186     n = A->rmap->n;
187     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
188     aa   = a->a;  /* Nonzero elements stored row-by-row. */
189     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
190     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,n,n,ai,ai+1,aj,aa);
191     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
192     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
193     stat = mkl_sparse_optimize(aijmkl->csrA);
194     if (stat != SPARSE_STATUS_SUCCESS) {
195       PetscFunctionReturn(PETSC_ERR_LIB);
196     }
197     aijmkl->sparse_optimized = PETSC_TRUE;
198   }
199 #endif
200 
201   PetscFunctionReturn(0);
202 }
203 
204 #undef __FUNCT__
205 #define __FUNCT__ "MatMult_SeqAIJMKL"
206 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
207 {
208   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
209   const PetscScalar *x;
210   PetscScalar       *y;
211   const MatScalar   *aa;
212   PetscErrorCode    ierr;
213   PetscInt          m=A->rmap->n;
214   const PetscInt    *aj,*ai;
215 
216   /* Variables not in MatMult_SeqAIJ. */
217   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
218 
219   PetscFunctionBegin;
220   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
221   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
222   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
223   aa   = a->a;  /* Nonzero elements stored row-by-row. */
224   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
225 
226   /* Call MKL sparse BLAS routine to do the MatMult. */
227   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
228 
229   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
230   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
231   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
236 #undef __FUNCT__
237 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
238 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
239 {
240   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
241   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
242   const PetscScalar *x;
243   PetscScalar       *y;
244   PetscErrorCode    ierr;
245   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
246 
247   PetscFunctionBegin;
248 
249   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
251 
252   /* Call MKL SpMV2 executor routine to do the MatMult. */
253   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
254 
255   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
256   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
257   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
258   if (stat != SPARSE_STATUS_SUCCESS) {
259     PetscFunctionReturn(PETSC_ERR_LIB);
260   }
261   PetscFunctionReturn(0);
262 }
263 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
264 
265 #undef __FUNCT__
266 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
267 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
268 {
269   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
270   const PetscScalar *x;
271   PetscScalar       *y;
272   const MatScalar   *aa;
273   PetscErrorCode    ierr;
274   PetscInt          m=A->rmap->n;
275   const PetscInt    *aj,*ai;
276 
277   /* Variables not in MatMultTranspose_SeqAIJ. */
278   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
279 
280   PetscFunctionBegin;
281   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
282   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
283   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
284   aa   = a->a;  /* Nonzero elements stored row-by-row. */
285   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
286 
287   /* Call MKL sparse BLAS routine to do the MatMult. */
288   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
289 
290   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
291   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
297 #undef __FUNCT__
298 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
299 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
300 {
301   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
302   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
303   const PetscScalar *x;
304   PetscScalar       *y;
305   PetscErrorCode    ierr;
306   sparse_status_t   stat;
307 
308   PetscFunctionBegin;
309 
310   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
311   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
312 
313   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
314   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
315 
316   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
319   if (stat != SPARSE_STATUS_SUCCESS) {
320     PetscFunctionReturn(PETSC_ERR_LIB);
321   }
322   PetscFunctionReturn(0);
323 }
324 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMultAdd_SeqAIJMKL"
328 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
329 {
330   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
331   const PetscScalar *x;
332   PetscScalar       *y,*z;
333   const MatScalar   *aa;
334   PetscErrorCode    ierr;
335   PetscInt          m=A->rmap->n;
336   const PetscInt    *aj,*ai;
337   PetscInt          i;
338 
339   /* Variables not in MatMultAdd_SeqAIJ. */
340   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
341   PetscScalar       alpha = 1.0;
342   PetscScalar       beta = 1.0;
343   char              matdescra[6];
344 
345   PetscFunctionBegin;
346   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
347   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
348 
349   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
350   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
351   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
352   aa   = a->a;  /* Nonzero elements stored row-by-row. */
353   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
354 
355   /* Call MKL sparse BLAS routine to do the MatMult. */
356   if (zz == yy) {
357     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
358     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
359   } else {
360     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
361      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
362     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
363     for (i=0; i<m; i++) {
364       z[i] += y[i];
365     }
366   }
367 
368   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
369   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
370   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
371   PetscFunctionReturn(0);
372 }
373 
374 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
375 #undef __FUNCT__
376 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
377 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
378 {
379   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
380   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
381   const PetscScalar *x;
382   PetscScalar       *y,*z;
383   PetscErrorCode    ierr;
384   PetscInt          m=A->rmap->n;
385   PetscInt          i;
386 
387   /* Variables not in MatMultAdd_SeqAIJ. */
388   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
389 
390   PetscFunctionBegin;
391 
392 
393   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
395 
396   /* Call MKL sparse BLAS routine to do the MatMult. */
397   if (zz == yy) {
398     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
399      * with alpha and beta both set to 1.0. */
400     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
401   } else {
402     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
403      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
404     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
405     for (i=0; i<m; i++) {
406       z[i] += y[i];
407     }
408   }
409 
410   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
411   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
412   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
413   if (stat != SPARSE_STATUS_SUCCESS) {
414     PetscFunctionReturn(PETSC_ERR_LIB);
415   }
416   PetscFunctionReturn(0);
417 }
418 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
419 
420 #undef __FUNCT__
421 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
422 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
423 {
424   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
425   const PetscScalar *x;
426   PetscScalar       *y,*z;
427   const MatScalar   *aa;
428   PetscErrorCode    ierr;
429   PetscInt          m=A->rmap->n;
430   const PetscInt    *aj,*ai;
431   PetscInt          i;
432 
433   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
434   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
435   PetscScalar       alpha = 1.0;
436   PetscScalar       beta = 1.0;
437   char              matdescra[6];
438 
439   PetscFunctionBegin;
440   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
441   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
442 
443   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
444   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
445   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
446   aa   = a->a;  /* Nonzero elements stored row-by-row. */
447   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
448 
449   /* Call MKL sparse BLAS routine to do the MatMult. */
450   if (zz == yy) {
451     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
452     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
453   } else {
454     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
455      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
456     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
457     for (i=0; i<m; i++) {
458       z[i] += y[i];
459     }
460   }
461 
462   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
463   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
464   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
465   PetscFunctionReturn(0);
466 }
467 
468 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
469 #undef __FUNCT__
470 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
471 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
472 {
473   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
474   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
475   const PetscScalar *x;
476   PetscScalar       *y,*z;
477   PetscErrorCode    ierr;
478   PetscInt          m=A->rmap->n;
479   PetscInt          i;
480 
481   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
482   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
483 
484   PetscFunctionBegin;
485 
486   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
487   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
488 
489   /* Call MKL sparse BLAS routine to do the MatMult. */
490   if (zz == yy) {
491     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
492      * with alpha and beta both set to 1.0. */
493     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
494   } else {
495     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
496      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
497     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
498     for (i=0; i<m; i++) {
499       z[i] += y[i];
500     }
501   }
502 
503   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
504   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
505   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
506   if (stat != SPARSE_STATUS_SUCCESS) {
507     PetscFunctionReturn(PETSC_ERR_LIB);
508   }
509   PetscFunctionReturn(0);
510 }
511 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
512 
513 
514 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
515  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
516  * routine, but can also be used to convert an assembled SeqAIJ matrix
517  * into a SeqAIJMKL one. */
518 #undef __FUNCT__
519 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
520 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
521 {
522   PetscErrorCode ierr;
523   Mat            B = *newmat;
524   Mat_SeqAIJMKL  *aijmkl;
525   PetscBool      set;
526   PetscBool      sametype;
527 
528   PetscFunctionBegin;
529   if (reuse == MAT_INITIAL_MATRIX) {
530     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
531   }
532 
533   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
534   if (sametype) PetscFunctionReturn(0);
535 
536   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
537   B->spptr = (void*) aijmkl;
538 
539   /* Set function pointers for methods that we inherit from AIJ but override.
540    * We also parse some command line options below, since those determine some of the methods we point to.
541    * Note: Currently the transposed operations are not being set because I encounter memory corruption
542    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
543   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
544   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
545   B->ops->destroy          = MatDestroy_SeqAIJMKL;
546 
547   aijmkl->sparse_optimized = PETSC_FALSE;
548 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
549   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
550 #elif
551   aijmkl->no_SpMV2 = PETSC_TRUE;
552 #endif
553 
554   /* Parse command line options. */
555   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
556   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
557   ierr = PetscOptionsEnd();CHKERRQ(ierr);
558 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
559   if(!aijmkl->no_SpMV2) {
560     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
561     aijmkl->no_SpMV2 = PETSC_TRUE;
562   }
563 #endif
564 
565   if(!aijmkl->no_SpMV2) {
566 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
567     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
568     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
569     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
570     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
571 #endif
572   } else {
573     B->ops->mult             = MatMult_SeqAIJMKL;
574     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
575     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
576     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
577   }
578 
579   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
580   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
581   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
582   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
583 
584   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
585   *newmat = B;
586   PetscFunctionReturn(0);
587 }
588 
589 #undef __FUNCT__
590 #define __FUNCT__ "MatCreateSeqAIJMKL"
591 /*@C
592    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
593    This type inherits from AIJ and is largely identical, but uses sparse BLAS
594    routines from Intel MKL whenever possible.
595    Collective on MPI_Comm
596 
597    Input Parameters:
598 +  comm - MPI communicator, set to PETSC_COMM_SELF
599 .  m - number of rows
600 .  n - number of columns
601 .  nz - number of nonzeros per row (same for all rows)
602 -  nnz - array containing the number of nonzeros in the various rows
603          (possibly different for each row) or NULL
604 
605    Output Parameter:
606 .  A - the matrix
607 
608    Notes:
609    If nnz is given then nz is ignored
610 
611    Level: intermediate
612 
613 .keywords: matrix, cray, sparse, parallel
614 
615 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
616 @*/
617 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
618 {
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   ierr = MatCreate(comm,A);CHKERRQ(ierr);
623   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
624   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
625   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
626   PetscFunctionReturn(0);
627 }
628 
629 #undef __FUNCT__
630 #define __FUNCT__ "MatCreate_SeqAIJMKL"
631 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
632 {
633   PetscErrorCode ierr;
634 
635   PetscFunctionBegin;
636   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
637   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640