xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 19afcda94e2ebad3365cdb198a45e916fe6f5ab5)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
20   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
21   struct matrix_descr descr;
22 #endif
23 } Mat_SeqAIJMKL;
24 
25 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
26 
27 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
28 {
29   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
30   /* so we will ignore 'MatType type'. */
31   PetscErrorCode ierr;
32   Mat            B       = *newmat;
33 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 #endif
36 
37   PetscFunctionBegin;
38   if (reuse == MAT_INITIAL_MATRIX) {
39     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50   B->ops->scale            = MatScale_SeqAIJ;
51   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
52   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
53   B->ops->axpy             = MatAXPY_SeqAIJ;
54 
55   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
56   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59 
60   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
61    * simply involves destroying the MKL sparse matrix handle and then freeing
62    * the spptr pointer. */
63 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
64   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
65 
66   if (aijmkl->sparse_optimized) {
67     sparse_status_t stat;
68     stat = mkl_sparse_destroy(aijmkl->csrA);
69     if (stat != SPARSE_STATUS_SUCCESS) {
70       PetscFunctionReturn(PETSC_ERR_LIB);
71     }
72   }
73 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
74   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
75 
76   /* Change the type of B to MATSEQAIJ. */
77   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
78 
79   *newmat = B;
80   PetscFunctionReturn(0);
81 }
82 
83 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
84 {
85   PetscErrorCode ierr;
86   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
87 
88   PetscFunctionBegin;
89 
90   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
91    * spptr pointer. */
92   if (aijmkl) {
93     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
94 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
95     if (aijmkl->sparse_optimized) {
96       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
97       stat = mkl_sparse_destroy(aijmkl->csrA);
98       if (stat != SPARSE_STATUS_SUCCESS) {
99         PetscFunctionReturn(PETSC_ERR_LIB);
100       }
101     }
102 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
103     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
104   }
105 
106   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
107    * to destroy everything that remains. */
108   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
109   /* Note that I don't call MatSetType().  I believe this is because that
110    * is only to be called when *building* a matrix.  I could be wrong, but
111    * that is how things work for the SuperLU matrix class. */
112   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
113   PetscFunctionReturn(0);
114 }
115 
116 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
117  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
118  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
119  * handle, creates a new one, and then calls mkl_sparse_optimize().
120  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
121  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
122  * an unoptimized matrix handle here. */
123 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
124 {
125 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
126   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
127    * does nothing. We make it callable anyway in this case because it cuts
128    * down on littering the code with #ifdefs. */
129   PetscFunctionReturn(0);
130 #else
131   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
132   Mat_SeqAIJMKL   *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
133   PetscInt        m,n;
134   MatScalar       *aa;
135   PetscInt        *aj,*ai;
136   sparse_status_t stat;
137 
138   PetscFunctionBegin;
139   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
140 
141   if (aijmkl->sparse_optimized) {
142     /* Matrix has been previously assembled and optimized. Must destroy old
143      * matrix handle before running the optimization step again. */
144     stat = mkl_sparse_destroy(aijmkl->csrA);
145     if (stat != SPARSE_STATUS_SUCCESS) {
146       PetscFunctionReturn(PETSC_ERR_LIB);
147     }
148   }
149   aijmkl->sparse_optimized = PETSC_FALSE;
150 
151   /* Now perform the SpMV2 setup and matrix optimization. */
152   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
153   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
154   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
155   m = A->rmap->n;
156   n = A->cmap->n;
157   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
158   aa   = a->a;  /* Nonzero elements stored row-by-row. */
159   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
160   if ((a->nz!=0) & !(A->structure_only)) {
161     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
162      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
163     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
164     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
165     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
166     stat = mkl_sparse_optimize(aijmkl->csrA);
167     if (stat != SPARSE_STATUS_SUCCESS) {
168       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
169       PetscFunctionReturn(PETSC_ERR_LIB);
170     }
171     aijmkl->sparse_optimized = PETSC_TRUE;
172   }
173 
174   PetscFunctionReturn(0);
175 #endif
176 }
177 
178 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
179  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
180  * matrix handle. */
181 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
182 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,Mat *mat)
183 {
184   PetscErrorCode ierr;
185   sparse_status_t stat;
186   sparse_index_base_t indexing;
187   PetscInt nrows, ncols;
188   PetscInt *aj,*ai;
189   MatScalar *aa;
190   Mat A;
191   Mat_SeqAIJMKL *aijmkl;
192 
193   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,NULL,&aj,&aa);
194   if (stat != SPARSE_STATUS_SUCCESS) {
195     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
196     PetscFunctionReturn(PETSC_ERR_LIB);
197   }
198   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
199   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
200   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
201 
202   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
203    * Now turn it into a MATSEQAIJMKL. */
204   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
205   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
206   aijmkl->csrA = csrA;
207   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
208    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
209    * and just need to be able to run the MKL optimization step. */
210   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
211   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
212   stat = mkl_sparse_optimize(aijmkl->csrA);
213   if (stat != SPARSE_STATUS_SUCCESS) {
214     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
215     PetscFunctionReturn(PETSC_ERR_LIB);
216   }
217   aijmkl->sparse_optimized = PETSC_TRUE;
218 
219   *mat = A;
220   PetscFunctionReturn(0);
221 }
222 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
223 
224 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
225 {
226   PetscErrorCode ierr;
227   Mat_SeqAIJMKL *aijmkl;
228   Mat_SeqAIJMKL *aijmkl_dest;
229 
230   PetscFunctionBegin;
231   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
232   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
233   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
234   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
235   aijmkl_dest->sparse_optimized = PETSC_FALSE;
236   if (aijmkl->eager_inspection) {
237     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
243 {
244   PetscErrorCode  ierr;
245   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
246   Mat_SeqAIJMKL *aijmkl;
247 
248   PetscFunctionBegin;
249   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
250 
251   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
252    * extra information and some different methods, call the AssemblyEnd
253    * routine for a MATSEQAIJ.
254    * I'm not sure if this is the best way to do this, but it avoids
255    * a lot of code duplication. */
256   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
257   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
258 
259   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
260    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
261   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
262   if (aijmkl->eager_inspection) {
263     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
264   }
265 
266   PetscFunctionReturn(0);
267 }
268 
269 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
270 {
271   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
272   const PetscScalar *x;
273   PetscScalar       *y;
274   const MatScalar   *aa;
275   PetscErrorCode    ierr;
276   PetscInt          m=A->rmap->n;
277   PetscInt          n=A->cmap->n;
278   PetscScalar       alpha = 1.0;
279   PetscScalar       beta = 0.0;
280   const PetscInt    *aj,*ai;
281   char              matdescra[6];
282 
283 
284   /* Variables not in MatMult_SeqAIJ. */
285   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
286 
287   PetscFunctionBegin;
288   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
289   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
290   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
291   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
292   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
293   aa   = a->a;  /* Nonzero elements stored row-by-row. */
294   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
295 
296   /* Call MKL sparse BLAS routine to do the MatMult. */
297   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
298 
299   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
300   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
301   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304 
305 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
306 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
309   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
310   const PetscScalar *x;
311   PetscScalar       *y;
312   PetscErrorCode    ierr;
313   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
314 
315   PetscFunctionBegin;
316 
317   /* If there are no nonzero entries, zero yy and return immediately. */
318   if(!a->nz) {
319     PetscInt i;
320     PetscInt m=A->rmap->n;
321     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
322     for (i=0; i<m; i++) {
323       y[i] = 0.0;
324     }
325     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
326     PetscFunctionReturn(0);
327   }
328 
329   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
330   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
331 
332   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
333    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
334    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
335   if (!aijmkl->sparse_optimized) {
336     MatSeqAIJMKL_create_mkl_handle(A);
337   }
338 
339   /* Call MKL SpMV2 executor routine to do the MatMult. */
340   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
341 
342   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
343   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
345   if (stat != SPARSE_STATUS_SUCCESS) {
346     PetscFunctionReturn(PETSC_ERR_LIB);
347   }
348   PetscFunctionReturn(0);
349 }
350 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
351 
352 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
353 {
354   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
355   const PetscScalar *x;
356   PetscScalar       *y;
357   const MatScalar   *aa;
358   PetscErrorCode    ierr;
359   PetscInt          m=A->rmap->n;
360   PetscInt          n=A->cmap->n;
361   PetscScalar       alpha = 1.0;
362   PetscScalar       beta = 0.0;
363   const PetscInt    *aj,*ai;
364   char              matdescra[6];
365 
366   /* Variables not in MatMultTranspose_SeqAIJ. */
367   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
368 
369   PetscFunctionBegin;
370   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
371   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
372   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
373   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
374   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
375   aa   = a->a;  /* Nonzero elements stored row-by-row. */
376   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
377 
378   /* Call MKL sparse BLAS routine to do the MatMult. */
379   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
380 
381   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
382   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
383   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
384   PetscFunctionReturn(0);
385 }
386 
387 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
388 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
389 {
390   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
392   const PetscScalar *x;
393   PetscScalar       *y;
394   PetscErrorCode    ierr;
395   sparse_status_t   stat;
396 
397   PetscFunctionBegin;
398 
399   /* If there are no nonzero entries, zero yy and return immediately. */
400   if(!a->nz) {
401     PetscInt i;
402     PetscInt n=A->cmap->n;
403     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
404     for (i=0; i<n; i++) {
405       y[i] = 0.0;
406     }
407     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
408     PetscFunctionReturn(0);
409   }
410 
411   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
412   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
413 
414   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
415    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
416    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
417   if (!aijmkl->sparse_optimized) {
418     MatSeqAIJMKL_create_mkl_handle(A);
419   }
420 
421   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
422   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
423 
424   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
427   if (stat != SPARSE_STATUS_SUCCESS) {
428     PetscFunctionReturn(PETSC_ERR_LIB);
429   }
430   PetscFunctionReturn(0);
431 }
432 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
433 
434 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
435 {
436   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
437   const PetscScalar *x;
438   PetscScalar       *y,*z;
439   const MatScalar   *aa;
440   PetscErrorCode    ierr;
441   PetscInt          m=A->rmap->n;
442   PetscInt          n=A->cmap->n;
443   const PetscInt    *aj,*ai;
444   PetscInt          i;
445 
446   /* Variables not in MatMultAdd_SeqAIJ. */
447   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
448   PetscScalar       alpha = 1.0;
449   PetscScalar       beta;
450   char              matdescra[6];
451 
452   PetscFunctionBegin;
453   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
454   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
455 
456   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
457   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
458   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
459   aa   = a->a;  /* Nonzero elements stored row-by-row. */
460   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
461 
462   /* Call MKL sparse BLAS routine to do the MatMult. */
463   if (zz == yy) {
464     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
465     beta = 1.0;
466     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
467   } else {
468     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
469      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
470     beta = 0.0;
471     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
472     for (i=0; i<m; i++) {
473       z[i] += y[i];
474     }
475   }
476 
477   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
478   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
479   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
480   PetscFunctionReturn(0);
481 }
482 
483 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
484 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
485 {
486   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
487   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
488   const PetscScalar *x;
489   PetscScalar       *y,*z;
490   PetscErrorCode    ierr;
491   PetscInt          m=A->rmap->n;
492   PetscInt          i;
493 
494   /* Variables not in MatMultAdd_SeqAIJ. */
495   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
496 
497   PetscFunctionBegin;
498 
499   /* If there are no nonzero entries, set zz = yy and return immediately. */
500   if(!a->nz) {
501     PetscInt i;
502     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
503     for (i=0; i<m; i++) {
504       z[i] = y[i];
505     }
506     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
507     PetscFunctionReturn(0);
508   }
509 
510   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
511   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
512 
513   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
514    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
515    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
516   if (!aijmkl->sparse_optimized) {
517     MatSeqAIJMKL_create_mkl_handle(A);
518   }
519 
520   /* Call MKL sparse BLAS routine to do the MatMult. */
521   if (zz == yy) {
522     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
523      * with alpha and beta both set to 1.0. */
524     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
525   } else {
526     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
527      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
528     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
529     for (i=0; i<m; i++) {
530       z[i] += y[i];
531     }
532   }
533 
534   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
535   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
536   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
537   if (stat != SPARSE_STATUS_SUCCESS) {
538     PetscFunctionReturn(PETSC_ERR_LIB);
539   }
540   PetscFunctionReturn(0);
541 }
542 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
543 
544 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
545 {
546   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
547   const PetscScalar *x;
548   PetscScalar       *y,*z;
549   const MatScalar   *aa;
550   PetscErrorCode    ierr;
551   PetscInt          m=A->rmap->n;
552   PetscInt          n=A->cmap->n;
553   const PetscInt    *aj,*ai;
554   PetscInt          i;
555 
556   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
557   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
558   PetscScalar       alpha = 1.0;
559   PetscScalar       beta;
560   char              matdescra[6];
561 
562   PetscFunctionBegin;
563   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
564   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
565 
566   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
567   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
568   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
569   aa   = a->a;  /* Nonzero elements stored row-by-row. */
570   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
571 
572   /* Call MKL sparse BLAS routine to do the MatMult. */
573   if (zz == yy) {
574     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
575     beta = 1.0;
576     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
577   } else {
578     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
579      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
580     beta = 0.0;
581     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
582     for (i=0; i<n; i++) {
583       z[i] += y[i];
584     }
585   }
586 
587   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
588   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
589   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
590   PetscFunctionReturn(0);
591 }
592 
593 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
594 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
595 {
596   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
597   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
598   const PetscScalar *x;
599   PetscScalar       *y,*z;
600   PetscErrorCode    ierr;
601   PetscInt          n=A->cmap->n;
602   PetscInt          i;
603 
604   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
605   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
606 
607   PetscFunctionBegin;
608 
609   /* If there are no nonzero entries, set zz = yy and return immediately. */
610   if(!a->nz) {
611     PetscInt i;
612     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
613     for (i=0; i<n; i++) {
614       z[i] = y[i];
615     }
616     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
617     PetscFunctionReturn(0);
618   }
619 
620   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
621   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
622 
623   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
624    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
625    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
626   if (!aijmkl->sparse_optimized) {
627     MatSeqAIJMKL_create_mkl_handle(A);
628   }
629 
630   /* Call MKL sparse BLAS routine to do the MatMult. */
631   if (zz == yy) {
632     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
633      * with alpha and beta both set to 1.0. */
634     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
635   } else {
636     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
637      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
638     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
639     for (i=0; i<n; i++) {
640       z[i] += y[i];
641     }
642   }
643 
644   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
645   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
646   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
647   if (stat != SPARSE_STATUS_SUCCESS) {
648     PetscFunctionReturn(PETSC_ERR_LIB);
649   }
650   PetscFunctionReturn(0);
651 }
652 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
653 
654 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
655 {
656   PetscErrorCode ierr;
657 
658   PetscFunctionBegin;
659   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
660   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
665 {
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
670   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
671   PetscFunctionReturn(0);
672 }
673 
674 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
675 {
676   PetscErrorCode ierr;
677 
678   PetscFunctionBegin;
679   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
680   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
681   PetscFunctionReturn(0);
682 }
683 
684 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
685 {
686   PetscErrorCode ierr;
687 
688   PetscFunctionBegin;
689   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
690   if (str == SAME_NONZERO_PATTERN) {
691     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
692     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
693   }
694   PetscFunctionReturn(0);
695 }
696 
697 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
698  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
699  * routine, but can also be used to convert an assembled SeqAIJ matrix
700  * into a SeqAIJMKL one. */
701 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
702 {
703   PetscErrorCode ierr;
704   Mat            B = *newmat;
705   Mat_SeqAIJMKL  *aijmkl;
706   PetscBool      set;
707   PetscBool      sametype;
708 
709   PetscFunctionBegin;
710   if (reuse == MAT_INITIAL_MATRIX) {
711     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
712   }
713 
714   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
715   if (sametype) PetscFunctionReturn(0);
716 
717   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
718   B->spptr = (void*) aijmkl;
719 
720   /* Set function pointers for methods that we inherit from AIJ but override.
721    * We also parse some command line options below, since those determine some of the methods we point to. */
722   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
723   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
724   B->ops->destroy          = MatDestroy_SeqAIJMKL;
725 
726   aijmkl->sparse_optimized = PETSC_FALSE;
727 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
728   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
729 #else
730   aijmkl->no_SpMV2 = PETSC_TRUE;
731 #endif
732   aijmkl->eager_inspection = PETSC_FALSE;
733 
734   /* Parse command line options. */
735   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
736   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
737   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
738   ierr = PetscOptionsEnd();CHKERRQ(ierr);
739 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
740   if(!aijmkl->no_SpMV2) {
741     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
742     aijmkl->no_SpMV2 = PETSC_TRUE;
743   }
744 #endif
745 
746   if(!aijmkl->no_SpMV2) {
747 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
748     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
749     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
750     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
751     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
752 #endif
753   } else {
754     B->ops->mult             = MatMult_SeqAIJMKL;
755     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
756     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
757     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
758   }
759 
760   B->ops->scale              = MatScale_SeqAIJMKL;
761   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
762   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
763   B->ops->axpy               = MatAXPY_SeqAIJMKL;
764 
765   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
766   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
767   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
768   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
769   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
770 
771   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
772   *newmat = B;
773   PetscFunctionReturn(0);
774 }
775 
776 /*@C
777    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
778    This type inherits from AIJ and is largely identical, but uses sparse BLAS
779    routines from Intel MKL whenever possible.
780    MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
781    operations are currently supported.
782    If the installed version of MKL supports the "SpMV2" sparse
783    inspector-executor routines, then those are used by default.
784 
785    Collective on MPI_Comm
786 
787    Input Parameters:
788 +  comm - MPI communicator, set to PETSC_COMM_SELF
789 .  m - number of rows
790 .  n - number of columns
791 .  nz - number of nonzeros per row (same for all rows)
792 -  nnz - array containing the number of nonzeros in the various rows
793          (possibly different for each row) or NULL
794 
795    Output Parameter:
796 .  A - the matrix
797 
798    Options Database Keys:
799 .  -mat_aijmkl_no_spmv2 - disables use of the SpMV2 inspector-executor routines
800 
801    Notes:
802    If nnz is given then nz is ignored
803 
804    Level: intermediate
805 
806 .keywords: matrix, MKL, sparse, parallel
807 
808 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
809 @*/
810 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
811 {
812   PetscErrorCode ierr;
813 
814   PetscFunctionBegin;
815   ierr = MatCreate(comm,A);CHKERRQ(ierr);
816   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
817   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
818   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
819   PetscFunctionReturn(0);
820 }
821 
822 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
823 {
824   PetscErrorCode ierr;
825 
826   PetscFunctionBegin;
827   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
828   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
829   PetscFunctionReturn(0);
830 }
831