xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision e995cf246b4a206b0e422988b4108f6cdfca847f)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56   B->ops->scale            = MatScale_SeqAIJ;
57   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
58   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
59   B->ops->axpy             = MatAXPY_SeqAIJ;
60 
61   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
62   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
63   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
64   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
65 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
66   if(!aijmkl->no_SpMV2) {
67     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
68 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
69     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
70 #endif
71     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
72   }
73 
74   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
75    * simply involves destroying the MKL sparse matrix handle and then freeing
76    * the spptr pointer. */
77   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
78 
79   if (aijmkl->sparse_optimized) {
80     sparse_status_t stat;
81     stat = mkl_sparse_destroy(aijmkl->csrA);
82     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
83   }
84 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
85   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
86 
87   /* Change the type of B to MATSEQAIJ. */
88   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
89 
90   *newmat = B;
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
95 {
96   PetscErrorCode ierr;
97   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
98 
99   PetscFunctionBegin;
100 
101   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
102    * spptr pointer. */
103   if (aijmkl) {
104     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
105 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
106     if (aijmkl->sparse_optimized) {
107       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
108       stat = mkl_sparse_destroy(aijmkl->csrA);
109       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
110     }
111 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
112     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
113   }
114 
115   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
116    * to destroy everything that remains. */
117   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
118   /* Note that I don't call MatSetType().  I believe this is because that
119    * is only to be called when *building* a matrix.  I could be wrong, but
120    * that is how things work for the SuperLU matrix class. */
121   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
122   PetscFunctionReturn(0);
123 }
124 
125 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
126  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
127  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
128  * handle, creates a new one, and then calls mkl_sparse_optimize().
129  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
130  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
131  * an unoptimized matrix handle here. */
132 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
133 {
134 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
135   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
136    * does nothing. We make it callable anyway in this case because it cuts
137    * down on littering the code with #ifdefs. */
138   PetscFunctionBegin;
139   PetscFunctionReturn(0);
140 #else
141   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
142   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
143   PetscInt         m,n;
144   MatScalar        *aa;
145   PetscInt         *aj,*ai;
146   sparse_status_t  stat;
147   PetscErrorCode   ierr;
148 
149   PetscFunctionBegin;
150   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
151 
152   if (aijmkl->sparse_optimized) {
153     /* Matrix has been previously assembled and optimized. Must destroy old
154      * matrix handle before running the optimization step again. */
155     stat = mkl_sparse_destroy(aijmkl->csrA);
156     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
157   }
158   aijmkl->sparse_optimized = PETSC_FALSE;
159 
160   /* Now perform the SpMV2 setup and matrix optimization. */
161   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
162   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
163   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
164   m = A->rmap->n;
165   n = A->cmap->n;
166   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
167   aa   = a->a;  /* Nonzero elements stored row-by-row. */
168   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
169   if ((a->nz!=0) & !(A->structure_only)) {
170     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
171      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
172     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
173     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
174     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
175     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
176     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
177     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
178     stat = mkl_sparse_optimize(aijmkl->csrA);
179     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
180     aijmkl->sparse_optimized = PETSC_TRUE;
181     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
182   }
183 
184   PetscFunctionReturn(0);
185 #endif
186 }
187 
188 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
189  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
190  * matrix handle.
191  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
192  * there is no good alternative. */
193 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
194 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
195 {
196   PetscErrorCode      ierr;
197   sparse_status_t     stat;
198   sparse_index_base_t indexing;
199   PetscInt            nrows, ncols;
200   PetscInt            *aj,*ai,*dummy;
201   MatScalar           *aa;
202   Mat                 A;
203   Mat_SeqAIJMKL       *aijmkl;
204 
205   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
206   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
207   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
208 
209   if (reuse == MAT_REUSE_MATRIX) {
210     ierr = MatDestroy(mat);CHKERRQ(ierr);
211   }
212   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
213   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
214   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
215   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
216    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
217    * they will be destroyed when the MKL handle is destroyed.
218    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
219   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
220 
221   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
222    * Now turn it into a MATSEQAIJMKL. */
223   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
224 
225   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
226   aijmkl->csrA = csrA;
227 
228   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
229    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
230    * and just need to be able to run the MKL optimization step. */
231   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
232   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
233   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
234   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
235   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
236   stat = mkl_sparse_optimize(aijmkl->csrA);
237   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
238   aijmkl->sparse_optimized = PETSC_TRUE;
239   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
240 
241   *mat = A;
242   PetscFunctionReturn(0);
243 }
244 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
245 
246 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
247  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
248  * MatMatMultNumeric(). */
249 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
250 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
251 {
252   PetscInt            i;
253   PetscInt            nrows,ncols;
254   PetscInt            nz;
255   PetscInt            *ai,*aj,*dummy;
256   PetscScalar         *aa;
257   PetscErrorCode      ierr;
258   Mat_SeqAIJMKL       *aijmkl;
259   sparse_status_t     stat;
260   sparse_index_base_t indexing;
261 
262   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
263 
264   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
265   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
266   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
267 
268   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
269    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
270   for (i=0; i<nrows; i++) {
271     nz = ai[i+1] - ai[i];
272     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
273   }
274 
275   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
276   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
277 
278   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
279   /* We mark our matrix as having a valid, optimized MKL handle.
280    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
281   aijmkl->sparse_optimized = PETSC_TRUE;
282 
283   PetscFunctionReturn(0);
284 }
285 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
286 
287 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
288 {
289   PetscErrorCode ierr;
290   Mat_SeqAIJMKL  *aijmkl;
291   Mat_SeqAIJMKL  *aijmkl_dest;
292 
293   PetscFunctionBegin;
294   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
295   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
296   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
297   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
298   aijmkl_dest->sparse_optimized = PETSC_FALSE;
299   if (aijmkl->eager_inspection) {
300     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
301   }
302   PetscFunctionReturn(0);
303 }
304 
305 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
306 {
307   PetscErrorCode  ierr;
308   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
309   Mat_SeqAIJMKL   *aijmkl;
310 
311   PetscFunctionBegin;
312   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
313 
314   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
315    * extra information and some different methods, call the AssemblyEnd
316    * routine for a MATSEQAIJ.
317    * I'm not sure if this is the best way to do this, but it avoids
318    * a lot of code duplication. */
319   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
320   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
321 
322   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
323    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
324   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
325   if (aijmkl->eager_inspection) {
326     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
327   }
328 
329   PetscFunctionReturn(0);
330 }
331 
332 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
333 {
334   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
335   const PetscScalar *x;
336   PetscScalar       *y;
337   const MatScalar   *aa;
338   PetscErrorCode    ierr;
339   PetscInt          m=A->rmap->n;
340   PetscInt          n=A->cmap->n;
341   PetscScalar       alpha = 1.0;
342   PetscScalar       beta = 0.0;
343   const PetscInt    *aj,*ai;
344   char              matdescra[6];
345 
346 
347   /* Variables not in MatMult_SeqAIJ. */
348   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
349 
350   PetscFunctionBegin;
351   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
352   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
353   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
354   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
355   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
356   aa   = a->a;  /* Nonzero elements stored row-by-row. */
357   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
358 
359   /* Call MKL sparse BLAS routine to do the MatMult. */
360   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
361 
362   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
363   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
364   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 
368 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
369 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
370 {
371   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
372   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
373   const PetscScalar *x;
374   PetscScalar       *y;
375   PetscErrorCode    ierr;
376   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
377   PetscObjectState  state;
378 
379   PetscFunctionBegin;
380 
381   /* If there are no nonzero entries, zero yy and return immediately. */
382   if(!a->nz) {
383     PetscInt i;
384     PetscInt m=A->rmap->n;
385     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
386     for (i=0; i<m; i++) {
387       y[i] = 0.0;
388     }
389     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
390     PetscFunctionReturn(0);
391   }
392 
393   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
395 
396   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
397    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
398    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
399   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
400   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
401     MatSeqAIJMKL_create_mkl_handle(A);
402   }
403 
404   /* Call MKL SpMV2 executor routine to do the MatMult. */
405   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
406   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
407 
408   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
409   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
411   PetscFunctionReturn(0);
412 }
413 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
414 
415 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
416 {
417   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
418   const PetscScalar *x;
419   PetscScalar       *y;
420   const MatScalar   *aa;
421   PetscErrorCode    ierr;
422   PetscInt          m=A->rmap->n;
423   PetscInt          n=A->cmap->n;
424   PetscScalar       alpha = 1.0;
425   PetscScalar       beta = 0.0;
426   const PetscInt    *aj,*ai;
427   char              matdescra[6];
428 
429   /* Variables not in MatMultTranspose_SeqAIJ. */
430   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
431 
432   PetscFunctionBegin;
433   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
434   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
435   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
436   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
437   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
438   aa   = a->a;  /* Nonzero elements stored row-by-row. */
439   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
440 
441   /* Call MKL sparse BLAS routine to do the MatMult. */
442   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
443 
444   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
445   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
446   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
447   PetscFunctionReturn(0);
448 }
449 
450 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
451 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
452 {
453   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
454   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
455   const PetscScalar *x;
456   PetscScalar       *y;
457   PetscErrorCode    ierr;
458   sparse_status_t   stat;
459   PetscObjectState  state;
460 
461   PetscFunctionBegin;
462 
463   /* If there are no nonzero entries, zero yy and return immediately. */
464   if(!a->nz) {
465     PetscInt i;
466     PetscInt n=A->cmap->n;
467     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
468     for (i=0; i<n; i++) {
469       y[i] = 0.0;
470     }
471     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
472     PetscFunctionReturn(0);
473   }
474 
475   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
476   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
477 
478   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
479    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
480    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
481   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
482   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
483     MatSeqAIJMKL_create_mkl_handle(A);
484   }
485 
486   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
487   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
488   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
489 
490   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
491   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
492   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
493   PetscFunctionReturn(0);
494 }
495 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
496 
497 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
498 {
499   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
500   const PetscScalar *x;
501   PetscScalar       *y,*z;
502   const MatScalar   *aa;
503   PetscErrorCode    ierr;
504   PetscInt          m=A->rmap->n;
505   PetscInt          n=A->cmap->n;
506   const PetscInt    *aj,*ai;
507   PetscInt          i;
508 
509   /* Variables not in MatMultAdd_SeqAIJ. */
510   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
511   PetscScalar       alpha = 1.0;
512   PetscScalar       beta;
513   char              matdescra[6];
514 
515   PetscFunctionBegin;
516   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
517   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
518 
519   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
520   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
521   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
522   aa   = a->a;  /* Nonzero elements stored row-by-row. */
523   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
524 
525   /* Call MKL sparse BLAS routine to do the MatMult. */
526   if (zz == yy) {
527     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
528     beta = 1.0;
529     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
530   } else {
531     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
532      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
533     beta = 0.0;
534     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
535     for (i=0; i<m; i++) {
536       z[i] += y[i];
537     }
538   }
539 
540   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
541   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
542   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 
546 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
547 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
548 {
549   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
550   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
551   const PetscScalar *x;
552   PetscScalar       *y,*z;
553   PetscErrorCode    ierr;
554   PetscInt          m=A->rmap->n;
555   PetscInt          i;
556 
557   /* Variables not in MatMultAdd_SeqAIJ. */
558   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
559   PetscObjectState  state;
560 
561   PetscFunctionBegin;
562 
563   /* If there are no nonzero entries, set zz = yy and return immediately. */
564   if(!a->nz) {
565     PetscInt i;
566     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
567     for (i=0; i<m; i++) {
568       z[i] = y[i];
569     }
570     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
571     PetscFunctionReturn(0);
572   }
573 
574   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
575   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
576 
577   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
578    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
579    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
580   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
581   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
582     MatSeqAIJMKL_create_mkl_handle(A);
583   }
584 
585   /* Call MKL sparse BLAS routine to do the MatMult. */
586   if (zz == yy) {
587     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
588      * with alpha and beta both set to 1.0. */
589     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
590     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
591   } else {
592     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
593      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
594     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
595     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
596     for (i=0; i<m; i++) {
597       z[i] += y[i];
598     }
599   }
600 
601   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
602   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
603   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
604   PetscFunctionReturn(0);
605 }
606 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
607 
608 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
609 {
610   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
611   const PetscScalar *x;
612   PetscScalar       *y,*z;
613   const MatScalar   *aa;
614   PetscErrorCode    ierr;
615   PetscInt          m=A->rmap->n;
616   PetscInt          n=A->cmap->n;
617   const PetscInt    *aj,*ai;
618   PetscInt          i;
619 
620   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
621   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
622   PetscScalar       alpha = 1.0;
623   PetscScalar       beta;
624   char              matdescra[6];
625 
626   PetscFunctionBegin;
627   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
628   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
629 
630   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
631   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
632   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
633   aa   = a->a;  /* Nonzero elements stored row-by-row. */
634   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
635 
636   /* Call MKL sparse BLAS routine to do the MatMult. */
637   if (zz == yy) {
638     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
639     beta = 1.0;
640     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
641   } else {
642     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
643      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
644     beta = 0.0;
645     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
646     for (i=0; i<n; i++) {
647       z[i] += y[i];
648     }
649   }
650 
651   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
652   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
653   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
654   PetscFunctionReturn(0);
655 }
656 
657 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
658 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
659 {
660   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
661   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
662   const PetscScalar *x;
663   PetscScalar       *y,*z;
664   PetscErrorCode    ierr;
665   PetscInt          n=A->cmap->n;
666   PetscInt          i;
667   PetscObjectState  state;
668 
669   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
670   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
671 
672   PetscFunctionBegin;
673 
674   /* If there are no nonzero entries, set zz = yy and return immediately. */
675   if(!a->nz) {
676     PetscInt i;
677     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
678     for (i=0; i<n; i++) {
679       z[i] = y[i];
680     }
681     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
682     PetscFunctionReturn(0);
683   }
684 
685   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
687 
688   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
689    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
690    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
691   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
692   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
693     MatSeqAIJMKL_create_mkl_handle(A);
694   }
695 
696   /* Call MKL sparse BLAS routine to do the MatMult. */
697   if (zz == yy) {
698     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
699      * with alpha and beta both set to 1.0. */
700     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
701     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
702   } else {
703     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
704      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
705     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
706     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
707     for (i=0; i<n; i++) {
708       z[i] += y[i];
709     }
710   }
711 
712   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
713   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
714   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
715   PetscFunctionReturn(0);
716 }
717 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
718 
719 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
720 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
721  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
722  * For releases of MKL prior to version 18, update 2:
723  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
724  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
725  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
726  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
727 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
728 {
729   Mat_SeqAIJMKL    *a, *b;
730   sparse_matrix_t  csrA, csrB, csrC;
731   PetscErrorCode   ierr;
732   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
733   PetscObjectState state;
734 
735   PetscFunctionBegin;
736   a = (Mat_SeqAIJMKL*)A->spptr;
737   b = (Mat_SeqAIJMKL*)B->spptr;
738   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
739   if (!a->sparse_optimized || a->state != state) {
740     MatSeqAIJMKL_create_mkl_handle(A);
741   }
742   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
743   if (!b->sparse_optimized || b->state != state) {
744     MatSeqAIJMKL_create_mkl_handle(B);
745   }
746   csrA = a->csrA;
747   csrB = b->csrA;
748 
749   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
750   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
751 
752   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
753 
754   PetscFunctionReturn(0);
755 }
756 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
757 
758 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
759 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
760 {
761   Mat_SeqAIJMKL       *a, *b, *c;
762   sparse_matrix_t     csrA, csrB, csrC;
763   PetscErrorCode      ierr;
764   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
765   struct matrix_descr descr_type_gen;
766   PetscObjectState    state;
767 
768   PetscFunctionBegin;
769   a = (Mat_SeqAIJMKL*)A->spptr;
770   b = (Mat_SeqAIJMKL*)B->spptr;
771   c = (Mat_SeqAIJMKL*)C->spptr;
772   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
773   if (!a->sparse_optimized || a->state != state) {
774     MatSeqAIJMKL_create_mkl_handle(A);
775   }
776   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
777   if (!b->sparse_optimized || b->state != state) {
778     MatSeqAIJMKL_create_mkl_handle(B);
779   }
780   csrA = a->csrA;
781   csrB = b->csrA;
782   csrC = c->csrA;
783   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
784 
785   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
786                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
787                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
788 
789   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
790 
791   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
792   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
793 
794   PetscFunctionReturn(0);
795 }
796 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
797 
798 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
799 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
800 {
801   Mat_SeqAIJMKL    *a, *b;
802   sparse_matrix_t  csrA, csrB, csrC;
803   PetscErrorCode   ierr;
804   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
805   PetscObjectState state;
806 
807   PetscFunctionBegin;
808   a = (Mat_SeqAIJMKL*)A->spptr;
809   b = (Mat_SeqAIJMKL*)B->spptr;
810   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
811   if (!a->sparse_optimized || a->state != state) {
812     MatSeqAIJMKL_create_mkl_handle(A);
813   }
814   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
815   if (!b->sparse_optimized || b->state != state) {
816     MatSeqAIJMKL_create_mkl_handle(B);
817   }
818   csrA = a->csrA;
819   csrB = b->csrA;
820 
821   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
822   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
823 
824   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
825 
826   PetscFunctionReturn(0);
827 }
828 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
829 
830 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
831 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
832 {
833   Mat_SeqAIJMKL       *a, *p, *c;
834   sparse_matrix_t     csrA, csrP, csrC;
835   PetscBool           set, flag;
836   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
837   struct matrix_descr descr_type_gen;
838   PetscObjectState    state;
839   PetscErrorCode      ierr;
840 
841   PetscFunctionBegin;
842   ierr = MatIsSymmetricKnown(A,&set,&flag);
843   if (!set || (set && !flag)) {
844     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
845     PetscFunctionReturn(0);
846   }
847 
848   a = (Mat_SeqAIJMKL*)A->spptr;
849   p = (Mat_SeqAIJMKL*)P->spptr;
850   c = (Mat_SeqAIJMKL*)C->spptr;
851   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
852   if (!a->sparse_optimized || a->state != state) {
853     MatSeqAIJMKL_create_mkl_handle(A);
854   }
855   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
856   if (!p->sparse_optimized || p->state != state) {
857     MatSeqAIJMKL_create_mkl_handle(P);
858   }
859   csrA = a->csrA;
860   csrP = p->csrA;
861   csrC = c->csrA;
862   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
863 
864   /* TODO: Below won't work for complex matrix. Protect this! Maybe where function pointers are assigned in MatConvert? */
865   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FINALIZE_MULT);
866   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
867 
868   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
869   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
870 
871   PetscFunctionReturn(0);
872 }
873 #endif
874 
875 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
876 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
877 {
878   Mat_SeqAIJMKL       *a, *p;
879   sparse_matrix_t     csrA, csrP, csrC;
880   PetscBool           set, flag;
881   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
882   struct matrix_descr descr_type_gen;
883   PetscObjectState    state;
884   PetscErrorCode      ierr;
885 
886   PetscFunctionBegin;
887   ierr = MatIsSymmetricKnown(A,&set,&flag);
888   if (!set || (set && !flag)) {
889     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
890     PetscFunctionReturn(0);
891   }
892 
893   if (scall == MAT_REUSE_MATRIX) {
894     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
895     PetscFunctionReturn(0);
896   }
897 
898   a = (Mat_SeqAIJMKL*)A->spptr;
899   p = (Mat_SeqAIJMKL*)P->spptr;
900   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
901   if (!a->sparse_optimized || a->state != state) {
902     MatSeqAIJMKL_create_mkl_handle(A);
903   }
904   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
905   if (!p->sparse_optimized || p->state != state) {
906     MatSeqAIJMKL_create_mkl_handle(P);
907   }
908   csrA = a->csrA;
909   csrP = p->csrA;
910   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
911 
912   /* TODO: Below won't work for complex matrix. Protect this! Maybe where function pointers are assigned in MatConvert? */
913   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FULL_MULT);
914   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
915 
916   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
917   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
918 
919   PetscFunctionReturn(0);
920 }
921 #endif
922 
923 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
924 {
925   PetscErrorCode ierr;
926 
927   PetscFunctionBegin;
928   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
929   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
930   PetscFunctionReturn(0);
931 }
932 
933 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
934 {
935   PetscErrorCode ierr;
936 
937   PetscFunctionBegin;
938   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
939   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
940   PetscFunctionReturn(0);
941 }
942 
943 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
944 {
945   PetscErrorCode ierr;
946 
947   PetscFunctionBegin;
948   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
949   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
950   PetscFunctionReturn(0);
951 }
952 
953 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
954 {
955   PetscErrorCode ierr;
956 
957   PetscFunctionBegin;
958   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
959   if (str == SAME_NONZERO_PATTERN) {
960     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
961     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
962   }
963   PetscFunctionReturn(0);
964 }
965 
966 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
967  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
968  * routine, but can also be used to convert an assembled SeqAIJ matrix
969  * into a SeqAIJMKL one. */
970 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
971 {
972   PetscErrorCode ierr;
973   Mat            B = *newmat;
974   Mat_SeqAIJMKL  *aijmkl;
975   PetscBool      set;
976   PetscBool      sametype;
977 
978   PetscFunctionBegin;
979   if (reuse == MAT_INITIAL_MATRIX) {
980     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
981   }
982 
983   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
984   if (sametype) PetscFunctionReturn(0);
985 
986   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
987   B->spptr = (void*) aijmkl;
988 
989   /* Set function pointers for methods that we inherit from AIJ but override.
990    * We also parse some command line options below, since those determine some of the methods we point to. */
991   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
992   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
993   B->ops->destroy          = MatDestroy_SeqAIJMKL;
994 
995   aijmkl->sparse_optimized = PETSC_FALSE;
996 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
997   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
998 #else
999   aijmkl->no_SpMV2 = PETSC_TRUE;
1000 #endif
1001   aijmkl->eager_inspection = PETSC_FALSE;
1002 
1003   /* Parse command line options. */
1004   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
1005   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
1006   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
1007   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1008 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
1009   if(!aijmkl->no_SpMV2) {
1010     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
1011     aijmkl->no_SpMV2 = PETSC_TRUE;
1012   }
1013 #endif
1014 
1015   if(!aijmkl->no_SpMV2) {
1016 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
1017     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
1018     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
1019     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
1020     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
1021     B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
1022 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
1023     B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
1024 #ifndef PETSC_USE_COMPLEX
1025     B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
1026     B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
1027 #endif
1028 #endif
1029     B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
1030 #endif
1031   } else {
1032     B->ops->mult             = MatMult_SeqAIJMKL;
1033     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
1034     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
1035     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
1036   }
1037 
1038   B->ops->scale              = MatScale_SeqAIJMKL;
1039   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
1040   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
1041   B->ops->axpy               = MatAXPY_SeqAIJMKL;
1042 
1043   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
1044   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
1045   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
1046   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
1047   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
1048   if(!aijmkl->no_SpMV2) {
1049 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
1050     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1051 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
1052     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1053 #endif
1054     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1055 #endif
1056   }
1057 
1058   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1059   *newmat = B;
1060   PetscFunctionReturn(0);
1061 }
1062 
1063 /*@C
1064    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1065    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1066    routines from Intel MKL whenever possible.
1067    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, and MatTransposeMatMult
1068    operations are currently supported.
1069    If the installed version of MKL supports the "SpMV2" sparse
1070    inspector-executor routines, then those are used by default.
1071 
1072    Collective on MPI_Comm
1073 
1074    Input Parameters:
1075 +  comm - MPI communicator, set to PETSC_COMM_SELF
1076 .  m - number of rows
1077 .  n - number of columns
1078 .  nz - number of nonzeros per row (same for all rows)
1079 -  nnz - array containing the number of nonzeros in the various rows
1080          (possibly different for each row) or NULL
1081 
1082    Output Parameter:
1083 .  A - the matrix
1084 
1085    Options Database Keys:
1086 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1087 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1088 
1089    Notes:
1090    If nnz is given then nz is ignored
1091 
1092    Level: intermediate
1093 
1094 .keywords: matrix, MKL, sparse, parallel
1095 
1096 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1097 @*/
1098 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1099 {
1100   PetscErrorCode ierr;
1101 
1102   PetscFunctionBegin;
1103   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1104   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1105   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1106   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1111 {
1112   PetscErrorCode ierr;
1113 
1114   PetscFunctionBegin;
1115   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1116   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1117   PetscFunctionReturn(0);
1118 }
1119