xref: /petsc/src/mat/impls/baij/seq/baijmkl/baijmkl.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 /*
2   Defines basic operations for the MATSEQBAIJMKL matrix class.
3   Uses sparse BLAS operations from the Intel Math Kernel Library (MKL)
4   wherever possible. If used MKL verion is older than 11.3 PETSc default
5   code for sparse matrix operations is used.
6 */
7 
8 #include <../src/mat/impls/baij/seq/baij.h>
9 #include <../src/mat/impls/baij/seq/baijmkl/baijmkl.h>
10 #include <mkl_spblas.h>
11 
12 static PetscBool PetscSeqBAIJSupportsZeroBased(void) {
13   static PetscBool set = PETSC_FALSE, value;
14   int              n   = 1, ia[1], ja[1];
15   float            a[1];
16   sparse_status_t  status;
17   sparse_matrix_t  A;
18 
19   if (!set) {
20     status = mkl_sparse_s_create_bsr(&A, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_COLUMN_MAJOR, n, n, n, ia, ia, ja, a);
21     value  = (status != SPARSE_STATUS_NOT_SUPPORTED) ? PETSC_TRUE : PETSC_FALSE;
22     (void)mkl_sparse_destroy(A);
23     set = PETSC_TRUE;
24   }
25   return value;
26 }
27 
28 typedef struct {
29   PetscBool           sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
30   sparse_matrix_t     bsrA;             /* "Handle" used by SpMV2 inspector-executor routines. */
31   struct matrix_descr descr;
32   PetscInt           *ai1;
33   PetscInt           *aj1;
34 } Mat_SeqBAIJMKL;
35 
36 static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode);
37 extern PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat, MatAssemblyType);
38 
39 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJMKL_SeqBAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
40   /* This routine is only called to convert a MATBAIJMKL to its base PETSc type, */
41   /* so we will ignore 'MatType type'. */
42   Mat             B       = *newmat;
43   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
44 
45   PetscFunctionBegin;
46   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
47 
48   /* Reset the original function pointers. */
49   B->ops->duplicate        = MatDuplicate_SeqBAIJ;
50   B->ops->assemblyend      = MatAssemblyEnd_SeqBAIJ;
51   B->ops->destroy          = MatDestroy_SeqBAIJ;
52   B->ops->multtranspose    = MatMultTranspose_SeqBAIJ;
53   B->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJ;
54   B->ops->scale            = MatScale_SeqBAIJ;
55   B->ops->diagonalscale    = MatDiagonalScale_SeqBAIJ;
56   B->ops->axpy             = MatAXPY_SeqBAIJ;
57 
58   switch (A->rmap->bs) {
59   case 1:
60     B->ops->mult    = MatMult_SeqBAIJ_1;
61     B->ops->multadd = MatMultAdd_SeqBAIJ_1;
62     break;
63   case 2:
64     B->ops->mult    = MatMult_SeqBAIJ_2;
65     B->ops->multadd = MatMultAdd_SeqBAIJ_2;
66     break;
67   case 3:
68     B->ops->mult    = MatMult_SeqBAIJ_3;
69     B->ops->multadd = MatMultAdd_SeqBAIJ_3;
70     break;
71   case 4:
72     B->ops->mult    = MatMult_SeqBAIJ_4;
73     B->ops->multadd = MatMultAdd_SeqBAIJ_4;
74     break;
75   case 5:
76     B->ops->mult    = MatMult_SeqBAIJ_5;
77     B->ops->multadd = MatMultAdd_SeqBAIJ_5;
78     break;
79   case 6:
80     B->ops->mult    = MatMult_SeqBAIJ_6;
81     B->ops->multadd = MatMultAdd_SeqBAIJ_6;
82     break;
83   case 7:
84     B->ops->mult    = MatMult_SeqBAIJ_7;
85     B->ops->multadd = MatMultAdd_SeqBAIJ_7;
86     break;
87   case 11:
88     B->ops->mult    = MatMult_SeqBAIJ_11;
89     B->ops->multadd = MatMultAdd_SeqBAIJ_11;
90     break;
91   case 15:
92     B->ops->mult    = MatMult_SeqBAIJ_15_ver1;
93     B->ops->multadd = MatMultAdd_SeqBAIJ_N;
94     break;
95   default:
96     B->ops->mult    = MatMult_SeqBAIJ_N;
97     B->ops->multadd = MatMultAdd_SeqBAIJ_N;
98     break;
99   }
100   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaijmkl_seqbaij_C", NULL));
101 
102   /* Free everything in the Mat_SeqBAIJMKL data structure. Currently, this
103    * simply involves destroying the MKL sparse matrix handle and then freeing
104    * the spptr pointer. */
105   if (reuse == MAT_INITIAL_MATRIX) baijmkl = (Mat_SeqBAIJMKL *)B->spptr;
106 
107   if (baijmkl->sparse_optimized) PetscCallExternal(mkl_sparse_destroy, baijmkl->bsrA);
108   PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
109   PetscCall(PetscFree(B->spptr));
110 
111   /* Change the type of B to MATSEQBAIJ. */
112   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));
113 
114   *newmat = B;
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatDestroy_SeqBAIJMKL(Mat A) {
119   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
120 
121   PetscFunctionBegin;
122   if (baijmkl) {
123     /* Clean up everything in the Mat_SeqBAIJMKL data structure, then free A->spptr. */
124     if (baijmkl->sparse_optimized) PetscCallExternal(mkl_sparse_destroy, baijmkl->bsrA);
125     PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
126     PetscCall(PetscFree(A->spptr));
127   }
128 
129   /* Change the type of A back to SEQBAIJ and use MatDestroy_SeqBAIJ()
130    * to destroy everything that remains. */
131   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQBAIJ));
132   PetscCall(MatDestroy_SeqBAIJ(A));
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqBAIJMKL_create_mkl_handle(Mat A) {
137   Mat_SeqBAIJ    *a       = (Mat_SeqBAIJ *)A->data;
138   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
139   PetscInt        mbs, nbs, nz, bs;
140   MatScalar      *aa;
141   PetscInt       *aj, *ai;
142   PetscInt        i;
143 
144   PetscFunctionBegin;
145   if (baijmkl->sparse_optimized) {
146     /* Matrix has been previously assembled and optimized. Must destroy old
147      * matrix handle before running the optimization step again. */
148     PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
149     PetscCallMKL(mkl_sparse_destroy(baijmkl->bsrA));
150   }
151   baijmkl->sparse_optimized = PETSC_FALSE;
152 
153   /* Now perform the SpMV2 setup and matrix optimization. */
154   baijmkl->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
155   baijmkl->descr.mode = SPARSE_FILL_MODE_LOWER;
156   baijmkl->descr.diag = SPARSE_DIAG_NON_UNIT;
157   mbs                 = a->mbs;
158   nbs                 = a->nbs;
159   nz                  = a->nz;
160   bs                  = A->rmap->bs;
161   aa                  = a->a;
162 
163   if ((nz != 0) & !(A->structure_only)) {
164     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
165      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
166     if (PetscSeqBAIJSupportsZeroBased()) {
167       aj = a->j;
168       ai = a->i;
169       PetscCallMKL(mkl_sparse_x_create_bsr(&(baijmkl->bsrA), SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_COLUMN_MAJOR, mbs, nbs, bs, ai, ai + 1, aj, aa));
170     } else {
171       PetscCall(PetscMalloc2(mbs + 1, &ai, nz, &aj));
172       for (i = 0; i < mbs + 1; i++) ai[i] = a->i[i] + 1;
173       for (i = 0; i < nz; i++) aj[i] = a->j[i] + 1;
174       aa = a->a;
175       PetscCallMKL(mkl_sparse_x_create_bsr(&baijmkl->bsrA, SPARSE_INDEX_BASE_ONE, SPARSE_LAYOUT_COLUMN_MAJOR, mbs, nbs, bs, ai, ai + 1, aj, aa));
176       baijmkl->ai1 = ai;
177       baijmkl->aj1 = aj;
178     }
179     PetscCallMKL(mkl_sparse_set_mv_hint(baijmkl->bsrA, SPARSE_OPERATION_NON_TRANSPOSE, baijmkl->descr, 1000));
180     PetscCallMKL(mkl_sparse_set_memory_hint(baijmkl->bsrA, SPARSE_MEMORY_AGGRESSIVE));
181     PetscCallMKL(mkl_sparse_optimize(baijmkl->bsrA));
182     baijmkl->sparse_optimized = PETSC_TRUE;
183   }
184   PetscFunctionReturn(0);
185 }
186 
187 static PetscErrorCode MatDuplicate_SeqBAIJMKL(Mat A, MatDuplicateOption op, Mat *M) {
188   Mat_SeqBAIJMKL *baijmkl;
189   Mat_SeqBAIJMKL *baijmkl_dest;
190 
191   PetscFunctionBegin;
192   PetscCall(MatDuplicate_SeqBAIJ(A, op, M));
193   baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
194   PetscCall(PetscNewLog((*M), &baijmkl_dest));
195   (*M)->spptr = (void *)baijmkl_dest;
196   PetscCall(PetscMemcpy(baijmkl_dest, baijmkl, sizeof(Mat_SeqBAIJMKL)));
197   baijmkl_dest->sparse_optimized = PETSC_FALSE;
198   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
199   PetscFunctionReturn(0);
200 }
201 
202 static PetscErrorCode MatMult_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy) {
203   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
204   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
205   const PetscScalar *x;
206   PetscScalar       *y;
207 
208   PetscFunctionBegin;
209   /* If there are no nonzero entries, zero yy and return immediately. */
210   if (!a->nz) {
211     PetscCall(VecSet(yy, 0.0));
212     PetscFunctionReturn(0);
213   }
214 
215   PetscCall(VecGetArrayRead(xx, &x));
216   PetscCall(VecGetArray(yy, &y));
217 
218   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
219    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
220    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
221   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
222 
223   /* Call MKL SpMV2 executor routine to do the MatMult. */
224   PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, y));
225 
226   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - a->nonzerorowcnt * A->rmap->bs));
227   PetscCall(VecRestoreArrayRead(xx, &x));
228   PetscCall(VecRestoreArray(yy, &y));
229   PetscFunctionReturn(0);
230 }
231 
232 static PetscErrorCode MatMultTranspose_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy) {
233   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
234   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
235   const PetscScalar *x;
236   PetscScalar       *y;
237 
238   PetscFunctionBegin;
239   /* If there are no nonzero entries, zero yy and return immediately. */
240   if (!a->nz) {
241     PetscCall(VecSet(yy, 0.0));
242     PetscFunctionReturn(0);
243   }
244 
245   PetscCall(VecGetArrayRead(xx, &x));
246   PetscCall(VecGetArray(yy, &y));
247 
248   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
249    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
250    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
251   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
252 
253   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
254   PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, y));
255 
256   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - a->nonzerorowcnt * A->rmap->bs));
257   PetscCall(VecRestoreArrayRead(xx, &x));
258   PetscCall(VecRestoreArray(yy, &y));
259   PetscFunctionReturn(0);
260 }
261 
262 static PetscErrorCode MatMultAdd_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy, Vec zz) {
263   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
264   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
265   const PetscScalar *x;
266   PetscScalar       *y, *z;
267   PetscInt           m = a->mbs * A->rmap->bs;
268   PetscInt           i;
269 
270   PetscFunctionBegin;
271   /* If there are no nonzero entries, set zz = yy and return immediately. */
272   if (!a->nz) {
273     PetscCall(VecCopy(yy, zz));
274     PetscFunctionReturn(0);
275   }
276 
277   PetscCall(VecGetArrayRead(xx, &x));
278   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
279 
280   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
281    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
282    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
283   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
284 
285   /* Call MKL sparse BLAS routine to do the MatMult. */
286   if (zz == yy) {
287     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
288      * with alpha and beta both set to 1.0. */
289     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 1.0, z));
290   } else {
291     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
292      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
293     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, z));
294     for (i = 0; i < m; i++) z[i] += y[i];
295   }
296 
297   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz));
298   PetscCall(VecRestoreArrayRead(xx, &x));
299   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
300   PetscFunctionReturn(0);
301 }
302 
303 static PetscErrorCode MatMultTransposeAdd_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy, Vec zz) {
304   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
305   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
306   const PetscScalar *x;
307   PetscScalar       *y, *z;
308   PetscInt           n = a->nbs * A->rmap->bs;
309   PetscInt           i;
310   /* Variables not in MatMultTransposeAdd_SeqBAIJ. */
311 
312   PetscFunctionBegin;
313   /* If there are no nonzero entries, set zz = yy and return immediately. */
314   if (!a->nz) {
315     PetscCall(VecCopy(yy, zz));
316     PetscFunctionReturn(0);
317   }
318 
319   PetscCall(VecGetArrayRead(xx, &x));
320   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
321 
322   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
323    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
324    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
325   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
326 
327   /* Call MKL sparse BLAS routine to do the MatMult. */
328   if (zz == yy) {
329     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
330      * with alpha and beta both set to 1.0. */
331     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 1.0, z));
332   } else {
333     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
334      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
335     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, z));
336     for (i = 0; i < n; i++) z[i] += y[i];
337   }
338 
339   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz));
340   PetscCall(VecRestoreArrayRead(xx, &x));
341   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
342   PetscFunctionReturn(0);
343 }
344 
345 static PetscErrorCode MatScale_SeqBAIJMKL(Mat inA, PetscScalar alpha) {
346   PetscFunctionBegin;
347   PetscCall(MatScale_SeqBAIJ(inA, alpha));
348   PetscCall(MatSeqBAIJMKL_create_mkl_handle(inA));
349   PetscFunctionReturn(0);
350 }
351 
352 static PetscErrorCode MatDiagonalScale_SeqBAIJMKL(Mat A, Vec ll, Vec rr) {
353   PetscFunctionBegin;
354   PetscCall(MatDiagonalScale_SeqBAIJ(A, ll, rr));
355   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
356   PetscFunctionReturn(0);
357 }
358 
359 static PetscErrorCode MatAXPY_SeqBAIJMKL(Mat Y, PetscScalar a, Mat X, MatStructure str) {
360   PetscFunctionBegin;
361   PetscCall(MatAXPY_SeqBAIJ(Y, a, X, str));
362   if (str == SAME_NONZERO_PATTERN) {
363     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
364     PetscCall(MatSeqBAIJMKL_create_mkl_handle(Y));
365   }
366   PetscFunctionReturn(0);
367 }
368 /* MatConvert_SeqBAIJ_SeqBAIJMKL converts a SeqBAIJ matrix into a
369  * SeqBAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLBAIJ()
370  * routine, but can also be used to convert an assembled SeqBAIJ matrix
371  * into a SeqBAIJMKL one. */
372 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
373   Mat             B = *newmat;
374   Mat_SeqBAIJMKL *baijmkl;
375   PetscBool       sametype;
376 
377   PetscFunctionBegin;
378   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
379 
380   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
381   if (sametype) PetscFunctionReturn(0);
382 
383   PetscCall(PetscNewLog(B, &baijmkl));
384   B->spptr = (void *)baijmkl;
385 
386   /* Set function pointers for methods that we inherit from BAIJ but override.
387    * We also parse some command line options below, since those determine some of the methods we point to. */
388   B->ops->assemblyend = MatAssemblyEnd_SeqBAIJMKL;
389 
390   baijmkl->sparse_optimized = PETSC_FALSE;
391 
392   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatScale_SeqBAIJMKL_C", MatScale_SeqBAIJMKL));
393   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaijmkl_seqbaij_C", MatConvert_SeqBAIJMKL_SeqBAIJ));
394 
395   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJMKL));
396   *newmat = B;
397   PetscFunctionReturn(0);
398 }
399 
400 static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode) {
401   PetscFunctionBegin;
402   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
403   PetscCall(MatAssemblyEnd_SeqBAIJ(A, mode));
404   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
405   A->ops->destroy          = MatDestroy_SeqBAIJMKL;
406   A->ops->mult             = MatMult_SeqBAIJMKL_SpMV2;
407   A->ops->multtranspose    = MatMultTranspose_SeqBAIJMKL_SpMV2;
408   A->ops->multadd          = MatMultAdd_SeqBAIJMKL_SpMV2;
409   A->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJMKL_SpMV2;
410   A->ops->scale            = MatScale_SeqBAIJMKL;
411   A->ops->diagonalscale    = MatDiagonalScale_SeqBAIJMKL;
412   A->ops->axpy             = MatAXPY_SeqBAIJMKL;
413   A->ops->duplicate        = MatDuplicate_SeqBAIJMKL;
414   PetscFunctionReturn(0);
415 }
416 
417 /*@C
418    MatCreateSeqBAIJMKL - Creates a sparse matrix of type `MATSEQBAIJMKL`.
419    This type inherits from `MATSEQBAIJ` and is largely identical, but uses sparse BLAS
420    routines from Intel MKL whenever possible.
421    `MatMult()`, `MatMultAdd()`, `MatMultTranspose()`, and `MatMultTransposeAdd()`
422    operations are currently supported.
423    If the installed version of MKL supports the "SpMV2" sparse
424    inspector-executor routines, then those are used by default.
425    Default PETSc kernels are used otherwise.
426 
427    Input Parameters:
428 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
429 .  bs - size of block, the blocks are ALWAYS square. One can use MatSetBlockSizes() to set a different row and column blocksize but the row
430           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with MatCreateVecs()
431 .  m - number of rows
432 .  n - number of columns
433 .  nz - number of nonzero blocks  per block row (same for all rows)
434 -  nnz - array containing the number of nonzero blocks in the various block rows
435          (possibly different for each block row) or NULL
436 
437    Output Parameter:
438 .  A - the matrix
439 
440    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
441    MatXXXXSetPreallocation() paradigm instead of this routine directly.
442    [MatXXXXSetPreallocation() is, for example, `MatSeqBAIJSetPreallocation()`]
443 
444    Options Database Keys:
445 +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
446 -   -mat_block_size - size of the blocks to use
447 
448    Level: intermediate
449 
450    Notes:
451    The number of rows and columns must be divisible by blocksize.
452 
453    If the nnz parameter is given then the nz parameter is ignored
454 
455    A nonzero block is any block that as 1 or more nonzeros in it
456 
457    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
458    storage.  That is, the stored row and column indices can begin at
459    either one (as in Fortran) or zero.  See the users' manual for details.
460 
461    Specify the preallocated storage with either nz or nnz (not both).
462    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
463    allocation.  See Users-Manual: ch_mat for details.
464    matrices.
465 
466 .seealso: `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
467 @*/
468 PetscErrorCode MatCreateSeqBAIJMKL(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
469   PetscFunctionBegin;
470   PetscCall(MatCreate(comm, A));
471   PetscCall(MatSetSizes(*A, m, n, m, n));
472   PetscCall(MatSetType(*A, MATSEQBAIJMKL));
473   PetscCall(MatSeqBAIJSetPreallocation_SeqBAIJ(*A, bs, nz, (PetscInt *)nnz));
474   PetscFunctionReturn(0);
475 }
476 
477 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJMKL(Mat A) {
478   PetscFunctionBegin;
479   PetscCall(MatSetType(A, MATSEQBAIJ));
480   PetscCall(MatConvert_SeqBAIJ_SeqBAIJMKL(A, MATSEQBAIJMKL, MAT_INPLACE_MATRIX, &A));
481   PetscFunctionReturn(0);
482 }
483