xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision f36dfe3fc09dfa4a7c66e6eff9e8bb3de2dfbb56)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
/* Extra, MKL-specific state hung off A->spptr of a SeqAIJ matrix. The CSR
 * arrays themselves remain in the underlying Mat_SeqAIJ; this struct only
 * tracks the MKL inspector-executor ("SpMV2") handle built from them. */
typedef struct {
  PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
  PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called and csrA is valid. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
  struct matrix_descr descr; /* Descriptor (type/fill-mode/diag) passed alongside csrA to the executor routines. */
#endif
} Mat_SeqAIJMKL;

/* Base-class assembly-end routine; reused by MatAssemblyEnd_SeqAIJMKL below. */
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 #undef __FUNCT__
27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 
36   PetscFunctionBegin;
37   if (reuse == MAT_INITIAL_MATRIX) {
38     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
39     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50 
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55 
56   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
57    * simply involves destroying the MKL sparse matrix handle and then freeing
58    * the spptr pointer. */
59 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
60   if (aijmkl->sparse_optimized) {
61     sparse_status_t stat;
62     stat = mkl_sparse_destroy(aijmkl->csrA);
63     if (stat != SPARSE_STATUS_SUCCESS) {
64       PetscFunctionReturn(PETSC_ERR_LIB);
65     }
66   }
67 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
68   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
69 
70   /* Change the type of B to MATSEQAIJ. */
71   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
72 
73   *newmat = B;
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNCT__
78 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNCT__
113 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
114 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
115 {
116   PetscErrorCode ierr;
117   Mat_SeqAIJMKL *aijmkl;
118   Mat_SeqAIJMKL *aijmkl_dest;
119 
120   PetscFunctionBegin;
121   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
122   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
123   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
124   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
125   aijmkl_dest->sparse_optimized = PETSC_FALSE;
126 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
127   aijmkl_dest->csrA = NULL;
128   if (!aijmkl->no_SpMV2 && A->cmap->n > 0) {
129     sparse_status_t stat;
130     stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA);
131     if (stat != SPARSE_STATUS_SUCCESS) {
132       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_copy");
133       PetscFunctionReturn(PETSC_ERR_LIB);
134     }
135     stat = mkl_sparse_optimize(aijmkl_dest->csrA);
136     if (stat != SPARSE_STATUS_SUCCESS) {
137       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
138       PetscFunctionReturn(PETSC_ERR_LIB);
139     }
140     aijmkl_dest->sparse_optimized = PETSC_TRUE;
141   }
142 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
143   PetscFunctionReturn(0);
144 }
145 
146 #undef __FUNCT__
147 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
148 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
149 {
150   PetscErrorCode  ierr;
151   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
152   Mat_SeqAIJMKL   *aijmkl;
153 
154   MatScalar       *aa;
155   PetscInt        m,n;
156   PetscInt        *aj,*ai;
157 
158   PetscFunctionBegin;
159   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
160 
161   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
162    * extra information and some different methods, call the AssemblyEnd
163    * routine for a MATSEQAIJ.
164    * I'm not sure if this is the best way to do this, but it avoids
165    * a lot of code duplication.
166    * I also note that currently MATSEQAIJMKL doesn't know anything about
167    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
168    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
169    * this, this may break things.  (Don't know... haven't looked at it.
170    * Do I need to disable this somehow?) */
171   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
172   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
173 
174   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
175 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
176   if (!aijmkl->no_SpMV2) {
177     sparse_status_t stat;
178     if (aijmkl->sparse_optimized) {
179       /* Matrix has been previously assembled and optimized. Must destroy old
180        * matrix handle before running the optimization step again. */
181       sparse_status_t stat;
182       stat = mkl_sparse_destroy(aijmkl->csrA);
183       if (stat != SPARSE_STATUS_SUCCESS) {
184         PetscFunctionReturn(PETSC_ERR_LIB);
185       }
186     }
187     /* Now perform the SpMV2 setup and matrix optimization. */
188     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
189     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
190     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
191     m = A->rmap->n;
192     n = A->cmap->n;
193     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
194     aa   = a->a;  /* Nonzero elements stored row-by-row. */
195     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
196     if (n>0) {
197       stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
198       stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
199       stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
200       stat = mkl_sparse_optimize(aijmkl->csrA);
201       if (stat != SPARSE_STATUS_SUCCESS) {
202         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
203         PetscFunctionReturn(PETSC_ERR_LIB);
204       }
205       aijmkl->sparse_optimized = PETSC_TRUE;
206     }
207   }
208 #endif
209 
210   PetscFunctionReturn(0);
211 }
212 
213 #undef __FUNCT__
214 #define __FUNCT__ "MatMult_SeqAIJMKL"
215 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
216 {
217   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
218   const PetscScalar *x;
219   PetscScalar       *y;
220   const MatScalar   *aa;
221   PetscErrorCode    ierr;
222   PetscInt          m=A->rmap->n;
223   const PetscInt    *aj,*ai;
224 
225   /* Variables not in MatMult_SeqAIJ. */
226   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
227 
228   PetscFunctionBegin;
229   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
230   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
231   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
232   aa   = a->a;  /* Nonzero elements stored row-by-row. */
233   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
234 
235   /* Call MKL sparse BLAS routine to do the MatMult. */
236   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
237 
238   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
239   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
240   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
/* Computes yy = A*xx using the MKL SpMV2 inspector-executor kernel on the
 * handle built in MatAssemblyEnd_SeqAIJMKL(). */
PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  PetscErrorCode    ierr;
  sparse_status_t   stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;

  /* If A has no columns, no MKL handle was created at assembly time (the
   * guard there is n > 0).  The product A*xx is then the zero vector, so set
   * yy = 0 and return.  Bug fix: the old code returned without touching yy,
   * leaving stale values; its comment also incorrectly said "no rows" while
   * testing the column count. */
  if (A->cmap->n < 1) {
    ierr = VecSet(yy,0.0);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);

  /* Call MKL SpMV2 executor routine to do the MatMult: y = 1.0*A*x + 0.0*y. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  /* SETERRQ so failures carry a message (was a bare error-code return). */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_mv");
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
279 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
280 {
281   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
282   const PetscScalar *x;
283   PetscScalar       *y;
284   const MatScalar   *aa;
285   PetscErrorCode    ierr;
286   PetscInt          m=A->rmap->n;
287   const PetscInt    *aj,*ai;
288 
289   /* Variables not in MatMultTranspose_SeqAIJ. */
290   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
291 
292   PetscFunctionBegin;
293   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
294   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
295   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
296   aa   = a->a;  /* Nonzero elements stored row-by-row. */
297   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
298 
299   /* Call MKL sparse BLAS routine to do the MatMult. */
300   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
301 
302   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
303   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
304   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
305   PetscFunctionReturn(0);
306 }
307 
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
/* Computes yy = A^T * xx using the MKL SpMV2 inspector-executor kernel. */
PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  PetscErrorCode    ierr;
  sparse_status_t   stat;

  PetscFunctionBegin;

  /* If A has no columns, no MKL handle was created at assembly time, and the
   * transpose product has length A->cmap->n == 0 anyway, so there is nothing
   * to compute.  (The old comment said "no rows"; it is the column count that
   * is guarded here, which happens to be correct for the transpose case.) */
  if (A->cmap->n < 1) PetscFunctionReturn(0);

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);

  /* Call MKL SpMV2 executor routine to do the MatMultTranspose: y = 1.0*A^T*x + 0.0*y. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  /* SETERRQ so failures carry a message (was a bare error-code return). */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_mv");
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
340 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJMKL"
/* Computes zz = yy + A*xx using MKL's classic sparse-BLAS CSR routines.
 * Two paths: when zz aliases yy, mkl_xcsrmv accumulates in place; otherwise
 * A*xx is written into zz and yy is added element-wise afterwards. */
PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
  PetscScalar       alpha = 1.0;
  PetscScalar       beta = 1.0;
  char              matdescra[6];

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  /* NOTE(review): elements 1, 2, 4, 5 of matdescra are left uninitialized;
   * presumably MKL ignores them for a 'g' (general) matrix -- confirm. */

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y.
     * NOTE(review): both row and column counts are passed as m here, and the
     * sum loop below also runs to m -- this assumes a square matrix; confirm
     * the rectangular case is never reached on this path. */
    mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
  } else {
    /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates z = A*x, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
    mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
388 
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
/* Computes zz = yy + A*xx using the MKL SpMV2 inspector-executor kernel. */
PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;

  /* If A has no columns, no MKL handle was created at assembly time and
   * A*xx is the zero vector, so the result is simply zz = yy.  Bug fix: the
   * old code returned immediately, leaving zz untouched when zz != yy. */
  if (A->cmap->n < 1) {
    if (zz != yy) {ierr = VecCopy(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
  } else {
    /* zz and yy are different vectors: compute z = A*x (beta = 0.0), then add yy.
     * Bug fix: the old code wrote A*x into y, clobbering the input vector yy
     * and producing zz = zz_old + A*x instead of yy + A*x. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  /* SETERRQ so failures carry a message (was a bare error-code return). */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_mv");
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
436 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
/* Computes zz = yy + A^T*xx using MKL's classic sparse-BLAS CSR routines.
 * Two paths: when zz aliases yy, mkl_xcsrmv accumulates in place; otherwise
 * A^T*xx is written into zz and yy is added element-wise afterwards. */
PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
  PetscScalar       alpha = 1.0;
  PetscScalar       beta = 1.0;
  char              matdescra[6];

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  /* NOTE(review): elements 1, 2, 4, 5 of matdescra are left uninitialized;
   * presumably MKL ignores them for a 'g' (general) matrix -- confirm. */

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A^T*x + beta*y.
     * NOTE(review): both dimensions are passed as m (row count) and the sum
     * loop below runs to m, although the transpose result has length
     * A->cmap->n -- this assumes a square matrix; confirm the rectangular
     * case is never reached on this path. */
    mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
  } else {
    /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates z = A^T*x, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
    mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
/* Computes zz = yy + A^T*xx using the MKL SpMV2 inspector-executor kernel. */
PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscErrorCode    ierr;
  PetscInt          n=A->cmap->n;  /* Length of the transpose product A^T*x. */
  PetscInt          i;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;

  /* If A has no columns, no MKL handle was created at assembly time, and the
   * output vectors have length A->cmap->n == 0, so there is nothing to do. */
  if (A->cmap->n < 1) PetscFunctionReturn(0);

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A^T*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
  } else {
    /* zz and yy are different vectors: compute z = A^T*x (beta = 0.0), then add yy.
     * Bug fixes: the old code (a) wrote A^T*x into y, clobbering the input
     * vector yy and producing zz = zz_old + A^T*x instead of yy + A^T*x, and
     * (b) ran the sum loop to m = A->rmap->n although the transpose result
     * has length n = A->cmap->n. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    for (i=0; i<n; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  /* SETERRQ so failures carry a message (was a bare error-code return). */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_mv");
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
532 
533 
534 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
535  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
536  * routine, but can also be used to convert an assembled SeqAIJ matrix
537  * into a SeqAIJMKL one. */
538 #undef __FUNCT__
539 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
540 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
541 {
542   PetscErrorCode ierr;
543   Mat            B = *newmat;
544   Mat_SeqAIJMKL  *aijmkl;
545   PetscBool      set;
546   PetscBool      sametype;
547 
548   PetscFunctionBegin;
549   if (reuse == MAT_INITIAL_MATRIX) {
550     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
551   }
552 
553   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
554   if (sametype) PetscFunctionReturn(0);
555 
556   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
557   B->spptr = (void*) aijmkl;
558 
559   /* Set function pointers for methods that we inherit from AIJ but override.
560    * We also parse some command line options below, since those determine some of the methods we point to.
561    * Note: Currently the transposed operations are not being set because I encounter memory corruption
562    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
563   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
564   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
565   B->ops->destroy          = MatDestroy_SeqAIJMKL;
566 
567   aijmkl->sparse_optimized = PETSC_FALSE;
568 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
569   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
570 #elif
571   aijmkl->no_SpMV2 = PETSC_TRUE;
572 #endif
573 
574   /* Parse command line options. */
575   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
576   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
577   ierr = PetscOptionsEnd();CHKERRQ(ierr);
578 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
579   if(!aijmkl->no_SpMV2) {
580     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
581     aijmkl->no_SpMV2 = PETSC_TRUE;
582   }
583 #endif
584 
585   if(!aijmkl->no_SpMV2) {
586 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
587     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
588     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
589     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
590     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
591 #endif
592   } else {
593     B->ops->mult             = MatMult_SeqAIJMKL;
594     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
595     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
596     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
597   }
598 
599   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
600   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
601   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
602   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
603 
604   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
605   *newmat = B;
606   PetscFunctionReturn(0);
607 }
608 
609 #undef __FUNCT__
610 #define __FUNCT__ "MatCreateSeqAIJMKL"
611 /*@C
612    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
613    This type inherits from AIJ and is largely identical, but uses sparse BLAS
614    routines from Intel MKL whenever possible.
615    Collective on MPI_Comm
616 
617    Input Parameters:
618 +  comm - MPI communicator, set to PETSC_COMM_SELF
619 .  m - number of rows
620 .  n - number of columns
621 .  nz - number of nonzeros per row (same for all rows)
622 -  nnz - array containing the number of nonzeros in the various rows
623          (possibly different for each row) or NULL
624 
625    Output Parameter:
626 .  A - the matrix
627 
628    Notes:
629    If nnz is given then nz is ignored
630 
631    Level: intermediate
632 
633 .keywords: matrix, cray, sparse, parallel
634 
635 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
636 @*/
637 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
638 {
639   PetscErrorCode ierr;
640 
641   PetscFunctionBegin;
642   ierr = MatCreate(comm,A);CHKERRQ(ierr);
643   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
644   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
645   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
646   PetscFunctionReturn(0);
647 }
648 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJMKL"
/* Registered constructor for MATSEQAIJMKL: builds a plain SeqAIJ matrix and
 * then converts it in place to the AIJMKL subclass. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
660