xref: /petsc/src/mat/impls/baij/seq/baijmkl/baijmkl.c (revision 2475b7ca256cea2a4b7cbf2d8babcda14e5fa36e)
1 /*
2   Defines basic operations for the MATSEQBAIJMKL matrix class.
3   Uses sparse BLAS operations from the Intel Math Kernel Library (MKL)
  wherever possible. If the MKL version in use is older than 11.3, the
  default PETSc code for sparse matrix operations is used instead.
6 */
7 
8 #include <../src/mat/impls/baij/seq/baij.h>
9 #include <../src/mat/impls/baij/seq/baijmkl/baijmkl.h>
10 #include <mkl_spblas.h>
11 
12 static PetscBool PetscSeqBAIJSupportsZeroBased(void)
13 {
14   static PetscBool set = PETSC_FALSE,value;
15   int              n=1,ia[1],ja[1];
16   float            a[1];
17   sparse_status_t  status;
18   sparse_matrix_t  A;
19 
20   if (!set) {
21     status = mkl_sparse_s_create_bsr(&A,SPARSE_INDEX_BASE_ZERO,SPARSE_LAYOUT_COLUMN_MAJOR,n,n,n,ia,ia,ja,a);
22     value  = (status != SPARSE_STATUS_NOT_SUPPORTED) ? PETSC_TRUE : PETSC_FALSE;
23     (void)   mkl_sparse_destroy(A);
24     set    = PETSC_TRUE;
25   }
26   return value;
27 }
28 
/* Private data appended to a SeqBAIJ matrix (via Mat->spptr) to hold the
 * MKL inspector-executor state. */
typedef struct {
  PetscBool           sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
  sparse_matrix_t     bsrA; /* "Handle" used by SpMV2 inspector-executor routines. */
  struct matrix_descr descr; /* Matrix descriptor (type/fill mode/diag) passed to the MKL executor routines. */
  PetscInt            *ai1; /* One-based copy of the block-row pointers; used only when MKL lacks zero-based BSR support. */
  PetscInt            *aj1; /* One-based copy of the block-column indices; paired with ai1. */
} Mat_SeqBAIJMKL;
36 
37 static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode);
38 extern PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat,MatAssemblyType);
39 
40 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJMKL_SeqBAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
41 {
42   /* This routine is only called to convert a MATBAIJMKL to its base PETSc type, */
43   /* so we will ignore 'MatType type'. */
44   PetscErrorCode ierr;
45   Mat            B        = *newmat;
46   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL*)A->spptr;
47 
48   PetscFunctionBegin;
49   if (reuse == MAT_INITIAL_MATRIX) {
50     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
51   }
52 
53   /* Reset the original function pointers. */
54   B->ops->duplicate        = MatDuplicate_SeqBAIJ;
55   B->ops->assemblyend      = MatAssemblyEnd_SeqBAIJ;
56   B->ops->destroy          = MatDestroy_SeqBAIJ;
57   B->ops->multtranspose    = MatMultTranspose_SeqBAIJ;
58   B->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJ;
59   B->ops->scale            = MatScale_SeqBAIJ;
60   B->ops->diagonalscale    = MatDiagonalScale_SeqBAIJ;
61   B->ops->axpy             = MatAXPY_SeqBAIJ;
62 
63   switch (A->rmap->bs) {
64     case 1:
65       B->ops->mult    = MatMult_SeqBAIJ_1;
66       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
67       break;
68     case 2:
69       B->ops->mult    = MatMult_SeqBAIJ_2;
70       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
71       break;
72     case 3:
73       B->ops->mult    = MatMult_SeqBAIJ_3;
74       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
75       break;
76     case 4:
77       B->ops->mult    = MatMult_SeqBAIJ_4;
78       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
79       break;
80     case 5:
81       B->ops->mult    = MatMult_SeqBAIJ_5;
82       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
83       break;
84     case 6:
85       B->ops->mult    = MatMult_SeqBAIJ_6;
86       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
87       break;
88     case 7:
89       B->ops->mult    = MatMult_SeqBAIJ_7;
90       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
91       break;
92     case 11:
93       B->ops->mult    = MatMult_SeqBAIJ_11;
94       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
95       break;
96     case 15:
97       B->ops->mult    = MatMult_SeqBAIJ_15_ver1;
98       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
99       break;
100     default:
101       B->ops->mult    = MatMult_SeqBAIJ_N;
102       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
103       break;
104   }
105   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaijmkl_seqbaij_C",NULL);CHKERRQ(ierr);
106 
107   /* Free everything in the Mat_SeqBAIJMKL data structure. Currently, this
108    * simply involves destroying the MKL sparse matrix handle and then freeing
109    * the spptr pointer. */
110   if (reuse == MAT_INITIAL_MATRIX) baijmkl = (Mat_SeqBAIJMKL*)B->spptr;
111 
112   if (baijmkl->sparse_optimized) {
113     sparse_status_t stat;
114     stat = mkl_sparse_destroy(baijmkl->bsrA);
115     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
116   }
117   ierr = PetscFree2(baijmkl->ai1,baijmkl->aj1);CHKERRQ(ierr);
118   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
119 
120   /* Change the type of B to MATSEQBAIJ. */
121   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ);CHKERRQ(ierr);
122 
123   *newmat = B;
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatDestroy_SeqBAIJMKL(Mat A)
128 {
129   PetscErrorCode ierr;
130   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL*) A->spptr;
131 
132   PetscFunctionBegin;
133   if (baijmkl) {
134     /* Clean up everything in the Mat_SeqBAIJMKL data structure, then free A->spptr. */
135     if (baijmkl->sparse_optimized) {
136       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
137       stat = mkl_sparse_destroy(baijmkl->bsrA);
138       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
139     }
140     ierr = PetscFree2(baijmkl->ai1,baijmkl->aj1);CHKERRQ(ierr);
141     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
142   }
143 
144   /* Change the type of A back to SEQBAIJ and use MatDestroy_SeqBAIJ()
145    * to destroy everything that remains. */
146   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQBAIJ);CHKERRQ(ierr);
147   ierr = MatDestroy_SeqBAIJ(A);CHKERRQ(ierr);
148   PetscFunctionReturn(0);
149 }
150 
151 static PetscErrorCode MatSeqBAIJMKL_create_mkl_handle(Mat A)
152 {
153   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
154   Mat_SeqBAIJMKL  *baijmkl = (Mat_SeqBAIJMKL*)A->spptr;
155   PetscInt        mbs, nbs, nz, bs;
156   MatScalar       *aa;
157   PetscInt        *aj,*ai;
158   sparse_status_t stat;
159   PetscErrorCode  ierr;
160   PetscInt        i;
161 
162   PetscFunctionBegin;
163   if (baijmkl->sparse_optimized) {
164     /* Matrix has been previously assembled and optimized. Must destroy old
165      * matrix handle before running the optimization step again. */
166     ierr = PetscFree2(baijmkl->ai1,baijmkl->aj1);CHKERRQ(ierr);
167     stat = mkl_sparse_destroy(baijmkl->bsrA);CHKERRMKL(stat);
168   }
169   baijmkl->sparse_optimized = PETSC_FALSE;
170 
171   /* Now perform the SpMV2 setup and matrix optimization. */
172   baijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
173   baijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
174   baijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
175   mbs = a->mbs;
176   nbs = a->nbs;
177   nz  = a->nz;
178   bs  = A->rmap->bs;
179   aa  = a->a;
180 
181   if ((nz!=0) & !(A->structure_only)) {
182     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
183      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
184     if (PetscSeqBAIJSupportsZeroBased()) {
185       aj   = a->j;
186       ai   = a->i;
187       stat = mkl_sparse_x_create_bsr(&(baijmkl->bsrA),SPARSE_INDEX_BASE_ZERO,SPARSE_LAYOUT_COLUMN_MAJOR,mbs,nbs,bs,ai,ai+1,aj,aa);CHKERRMKL(stat);
188     } else {
189       ierr = PetscMalloc2(mbs+1,&ai,nz,&aj);CHKERRQ(ierr);
190       for (i=0;i<mbs+1;i++) ai[i] = a->i[i]+1;
191       for (i=0;i<nz;i++) aj[i] = a->j[i]+1;
192       aa   = a->a;
193       stat = mkl_sparse_x_create_bsr(&baijmkl->bsrA,SPARSE_INDEX_BASE_ONE,SPARSE_LAYOUT_COLUMN_MAJOR,mbs,nbs,bs,ai,ai+1,aj,aa);CHKERRMKL(stat);
194       baijmkl->ai1 = ai;
195       baijmkl->aj1 = aj;
196     }
197     stat = mkl_sparse_set_mv_hint(baijmkl->bsrA,SPARSE_OPERATION_NON_TRANSPOSE,baijmkl->descr,1000);CHKERRMKL(stat);
198     stat = mkl_sparse_set_memory_hint(baijmkl->bsrA,SPARSE_MEMORY_AGGRESSIVE);CHKERRMKL(stat);
199     stat = mkl_sparse_optimize(baijmkl->bsrA);CHKERRMKL(stat);
200     baijmkl->sparse_optimized = PETSC_TRUE;
201   }
202   PetscFunctionReturn(0);
203 }
204 
205 static PetscErrorCode MatDuplicate_SeqBAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
206 {
207   PetscErrorCode ierr;
208   Mat_SeqBAIJMKL *baijmkl;
209   Mat_SeqBAIJMKL *baijmkl_dest;
210 
211   PetscFunctionBegin;
212   ierr = MatDuplicate_SeqBAIJ(A,op,M);CHKERRQ(ierr);
213   baijmkl = (Mat_SeqBAIJMKL*) A->spptr;
214   ierr = PetscNewLog((*M),&baijmkl_dest);CHKERRQ(ierr);
215   (*M)->spptr = (void*)baijmkl_dest;
216   ierr = PetscMemcpy(baijmkl_dest,baijmkl,sizeof(Mat_SeqBAIJMKL));CHKERRQ(ierr);
217   baijmkl_dest->sparse_optimized = PETSC_FALSE;
218   ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
219   PetscFunctionReturn(0);
220 }
221 
222 static PetscErrorCode MatMult_SeqBAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
223 {
224   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
225   Mat_SeqBAIJMKL     *baijmkl=(Mat_SeqBAIJMKL*)A->spptr;
226   const PetscScalar  *x;
227   PetscScalar        *y;
228   PetscErrorCode     ierr;
229   sparse_status_t    stat = SPARSE_STATUS_SUCCESS;
230 
231   PetscFunctionBegin;
232   /* If there are no nonzero entries, zero yy and return immediately. */
233   if (!a->nz) {
234     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
235     PetscFunctionReturn(0);
236   }
237 
238   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
239   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
240 
241   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
242    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
243    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
244   if (!baijmkl->sparse_optimized) {
245     ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
246   }
247 
248   /* Call MKL SpMV2 executor routine to do the MatMult. */
249   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,0.0,y);CHKERRMKL(stat);
250 
251   ierr = PetscLogFlops(2.0*a->bs2*a->nz - a->nonzerorowcnt*A->rmap->bs);CHKERRQ(ierr);
252   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
253   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
254   PetscFunctionReturn(0);
255 }
256 
257 static PetscErrorCode MatMultTranspose_SeqBAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
258 {
259   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ*)A->data;
260   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL*)A->spptr;
261   const PetscScalar *x;
262   PetscScalar       *y;
263   PetscErrorCode    ierr;
264   sparse_status_t   stat;
265 
266   PetscFunctionBegin;
267   /* If there are no nonzero entries, zero yy and return immediately. */
268   if(!a->nz) {
269     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
270     PetscFunctionReturn(0);
271   }
272 
273   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
274   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
275 
276   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
277    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
278    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
279   if (!baijmkl->sparse_optimized) {
280     ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
281   }
282 
283   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
284   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,0.0,y);CHKERRMKL(stat);
285 
286   ierr = PetscLogFlops(2.0*a->bs2*a->nz - a->nonzerorowcnt*A->rmap->bs);CHKERRQ(ierr);
287   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
288   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 static PetscErrorCode MatMultAdd_SeqBAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
293 {
294   Mat_SeqBAIJ        *a       = (Mat_SeqBAIJ*)A->data;
295   Mat_SeqBAIJMKL     *baijmkl = (Mat_SeqBAIJMKL*)A->spptr;
296   const PetscScalar  *x;
297   PetscScalar        *y,*z;
298   PetscErrorCode     ierr;
299   PetscInt           m=a->mbs*A->rmap->bs;
300   PetscInt           i;
301 
302   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
303 
304   PetscFunctionBegin;
305   /* If there are no nonzero entries, set zz = yy and return immediately. */
306   if (!a->nz) {
307     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
308     PetscFunctionReturn(0);
309   }
310 
311   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
312   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
313 
314   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
315    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
316    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
317   if (!baijmkl->sparse_optimized) {
318     ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
319   }
320 
321   /* Call MKL sparse BLAS routine to do the MatMult. */
322   if (zz == yy) {
323     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
324      * with alpha and beta both set to 1.0. */
325     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,1.0,z);CHKERRMKL(stat);
326   } else {
327     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
328      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
329     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,0.0,z);CHKERRMKL(stat);
330     for (i=0; i<m; i++) {
331       z[i] += y[i];
332     }
333   }
334 
335   ierr = PetscLogFlops(2.0*a->bs2*a->nz);CHKERRQ(ierr);
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
338   PetscFunctionReturn(0);
339 }
340 
341 static PetscErrorCode MatMultTransposeAdd_SeqBAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
342 {
343   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ*)A->data;
344   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL*)A->spptr;
345   const PetscScalar *x;
346   PetscScalar       *y,*z;
347   PetscErrorCode    ierr;
348   PetscInt          n=a->nbs*A->rmap->bs;
349   PetscInt          i;
350   /* Variables not in MatMultTransposeAdd_SeqBAIJ. */
351   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
352 
353   PetscFunctionBegin;
354   /* If there are no nonzero entries, set zz = yy and return immediately. */
355   if(!a->nz) {
356     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
357     PetscFunctionReturn(0);
358   }
359 
360   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
361   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
362 
363   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
364    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
365    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
366   if (!baijmkl->sparse_optimized) {
367     ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
368   }
369 
370   /* Call MKL sparse BLAS routine to do the MatMult. */
371   if (zz == yy) {
372     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
373      * with alpha and beta both set to 1.0. */
374     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,1.0,z);CHKERRMKL(stat);
375   } else {
376     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
377      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
378     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,baijmkl->bsrA,baijmkl->descr,x,0.0,z);CHKERRMKL(stat);
379     for (i=0; i<n; i++) {
380       z[i] += y[i];
381     }
382   }
383 
384   ierr = PetscLogFlops(2.0*a->bs2*a->nz);CHKERRQ(ierr);
385   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
386   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
387   PetscFunctionReturn(0);
388 }
389 
390 static PetscErrorCode MatScale_SeqBAIJMKL(Mat inA,PetscScalar alpha)
391 {
392   PetscErrorCode ierr;
393 
394   PetscFunctionBegin;
395   ierr = MatScale_SeqBAIJ(inA,alpha);CHKERRQ(ierr);
396   ierr = MatSeqBAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 static PetscErrorCode MatDiagonalScale_SeqBAIJMKL(Mat A,Vec ll,Vec rr)
401 {
402   PetscErrorCode ierr;
403 
404   PetscFunctionBegin;
405   ierr = MatDiagonalScale_SeqBAIJ(A,ll,rr);CHKERRQ(ierr);
406   ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
407   PetscFunctionReturn(0);
408 }
409 
410 static PetscErrorCode MatAXPY_SeqBAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
411 {
412   PetscErrorCode ierr;
413 
414   PetscFunctionBegin;
415   ierr = MatAXPY_SeqBAIJ(Y,a,X,str);CHKERRQ(ierr);
416   if (str == SAME_NONZERO_PATTERN) {
417     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
418     ierr = MatSeqBAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
419   }
420   PetscFunctionReturn(0);
421 }
/* MatConvert_SeqBAIJ_SeqBAIJMKL converts a SeqBAIJ matrix into a
 * SeqBAIJMKL matrix.  This routine is called by the MatCreate_SeqBAIJMKL()
 * routine, but can also be used to convert an assembled SeqBAIJ matrix
 * into a SeqBAIJMKL one. */
426 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
427 {
428   PetscErrorCode ierr;
429   Mat            B = *newmat;
430   Mat_SeqBAIJMKL *baijmkl;
431   PetscBool      sametype;
432 
433   PetscFunctionBegin;
434   if (reuse == MAT_INITIAL_MATRIX) {
435     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
436   }
437 
438   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
439   if (sametype) PetscFunctionReturn(0);
440 
441   ierr     = PetscNewLog(B,&baijmkl);CHKERRQ(ierr);
442   B->spptr = (void*)baijmkl;
443 
444   /* Set function pointers for methods that we inherit from BAIJ but override.
445    * We also parse some command line options below, since those determine some of the methods we point to. */
446   B->ops->assemblyend      = MatAssemblyEnd_SeqBAIJMKL;
447 
448   baijmkl->sparse_optimized = PETSC_FALSE;
449 
450   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqBAIJMKL_C",MatScale_SeqBAIJMKL);CHKERRQ(ierr);
451   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaijmkl_seqbaij_C",MatConvert_SeqBAIJMKL_SeqBAIJ);CHKERRQ(ierr);
452 
453   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQBAIJMKL);CHKERRQ(ierr);
454   *newmat = B;
455   PetscFunctionReturn(0);
456 }
457 
458 static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode)
459 {
460   PetscErrorCode  ierr;
461 
462   PetscFunctionBegin;
463   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
464   ierr = MatAssemblyEnd_SeqBAIJ(A, mode);CHKERRQ(ierr);
465   ierr = MatSeqBAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
466   A->ops->destroy          = MatDestroy_SeqBAIJMKL;
467   A->ops->mult             = MatMult_SeqBAIJMKL_SpMV2;
468   A->ops->multtranspose    = MatMultTranspose_SeqBAIJMKL_SpMV2;
469   A->ops->multadd          = MatMultAdd_SeqBAIJMKL_SpMV2;
470   A->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJMKL_SpMV2;
471   A->ops->scale            = MatScale_SeqBAIJMKL;
472   A->ops->diagonalscale    = MatDiagonalScale_SeqBAIJMKL;
473   A->ops->axpy             = MatAXPY_SeqBAIJMKL;
474   A->ops->duplicate        = MatDuplicate_SeqBAIJMKL;
475   PetscFunctionReturn(0);
476 }
477 
478 /*@C
479    MatCreateSeqBAIJMKL - Creates a sparse matrix of type SEQBAIJMKL.
480    This type inherits from BAIJ and is largely identical, but uses sparse BLAS
481    routines from Intel MKL whenever possible.
482    MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
483    operations are currently supported.
484    If the installed version of MKL supports the "SpMV2" sparse
485    inspector-executor routines, then those are used by default.
486    Default PETSc kernels are used otherwise.
487 
488    Input Parameters:
489 +  comm - MPI communicator, set to PETSC_COMM_SELF
490 .  bs - size of block, the blocks are ALWAYS square. One can use MatSetBlockSizes() to set a different row and column blocksize but the row
491           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with MatCreateVecs()
492 .  m - number of rows
493 .  n - number of columns
494 .  nz - number of nonzero blocks  per block row (same for all rows)
495 -  nnz - array containing the number of nonzero blocks in the various block rows
496          (possibly different for each block row) or NULL
497 
498 
499    Output Parameter:
500 .  A - the matrix
501 
502    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
503    MatXXXXSetPreallocation() paradigm instead of this routine directly.
504    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
505 
506    Options Database Keys:
507 .   -mat_no_unroll - uses code that does not unroll the loops in the
508                      block calculations (much slower)
509 .    -mat_block_size - size of the blocks to use
510 
511    Level: intermediate
512 
513    Notes:
514    The number of rows and columns must be divisible by blocksize.
515 
516    If the nnz parameter is given then the nz parameter is ignored
517 
   A nonzero block is any block that has 1 or more nonzeros in it
519 
520    The block AIJ format is fully compatible with standard Fortran 77
521    storage.  That is, the stored row and column indices can begin at
522    either one (as in Fortran) or zero.  See the users' manual for details.
523 
524    Specify the preallocated storage with either nz or nnz (not both).
525    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  See Users-Manual: ch_mat for details.
528 
529 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues(), MatCreateBAIJ()
530 
531 @*/
532 PetscErrorCode  MatCreateSeqBAIJMKL(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
533 {
534   PetscErrorCode ierr;
535 
536   PetscFunctionBegin;
537   ierr = MatCreate(comm,A);CHKERRQ(ierr);
538   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
539   ierr = MatSetType(*A,MATSEQBAIJMKL);CHKERRQ(ierr);
540   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*A,bs,nz,(PetscInt*)nnz);CHKERRQ(ierr);
541   PetscFunctionReturn(0);
542 }
543 
544 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJMKL(Mat A)
545 {
546   PetscErrorCode ierr;
547 
548   PetscFunctionBegin;
549   ierr = MatSetType(A,MATSEQBAIJ);CHKERRQ(ierr);
550   ierr = MatConvert_SeqBAIJ_SeqBAIJMKL(A,MATSEQBAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553