xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision bbadc9ee47b9852201359aaea290cac33ff02705)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m,&table));
28   PetscCall(PetscMalloc1(m+1,&nidx));
29   PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2));
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     PetscCall(PetscBTMemzero(m,table));
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     PetscCall(ISGetIndices(is[i],&idx));
38     PetscCall(ISGetLocalSize(is[i],&n));
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n; ++j) {
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
45     }
46     PetscCall(ISRestoreIndices(is[i],&idx));
47     PetscCall(ISDestroy(&is[i]));
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap*/
51       n = isz;
52       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end; l++) {
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i));
67   }
68   PetscCall(PetscBTDestroy(&table));
69   PetscCall(PetscFree(nidx));
70   PetscCall(PetscFree(nidx2));
71   PetscFunctionReturn(0);
72 }
73 
74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
75 {
76   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
77   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
78   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
79   const PetscInt *irow,*icol;
80   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
81   PetscInt       *aj = a->j,*ai = a->i;
82   MatScalar      *mat_a;
83   Mat            C;
84   PetscBool      flag;
85 
86   PetscFunctionBegin;
87   PetscCall(ISGetIndices(isrow,&irow));
88   PetscCall(ISGetIndices(iscol,&icol));
89   PetscCall(ISGetLocalSize(isrow,&nrows));
90   PetscCall(ISGetLocalSize(iscol,&ncols));
91 
92   PetscCall(PetscCalloc1(1+oldcols,&smap));
93   ssmap = smap;
94   PetscCall(PetscMalloc1(1+nrows,&lens));
95   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
96   /* determine lens of each row */
97   for (i=0; i<nrows; i++) {
98     kstart  = ai[irow[i]];
99     kend    = kstart + a->ilen[irow[i]];
100     lens[i] = 0;
101     for (k=kstart; k<kend; k++) {
102       if (ssmap[aj[k]]) lens[i]++;
103     }
104   }
105   /* Create and fill new matrix */
106   if (scall == MAT_REUSE_MATRIX) {
107     c = (Mat_SeqBAIJ*)((*B)->data);
108 
109     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
110     PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag));
111     PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
112     PetscCall(PetscArrayzero(c->ilen,c->mbs));
113     C    = *B;
114   } else {
115     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C));
116     PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE));
117     PetscCall(MatSetType(C,((PetscObject)A)->type_name));
118     PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens));
119   }
120   c = (Mat_SeqBAIJ*)(C->data);
121   for (i=0; i<nrows; i++) {
122     row      = irow[i];
123     kstart   = ai[row];
124     kend     = kstart + a->ilen[row];
125     mat_i    = c->i[i];
126     mat_j    = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
127     mat_a    = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */
128     mat_ilen = c->ilen + i;
129     for (k=kstart; k<kend; k++) {
130       if ((tcol=ssmap[a->j[k]])) {
131         *mat_j++ = tcol - 1;
132         PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2));
133         mat_a   += bs2;
134         (*mat_ilen)++;
135       }
136     }
137   }
138   /* sort */
139   if (c->j && c->a) {
140     MatScalar *work;
141     PetscCall(PetscMalloc1(bs2,&work));
142     for (i=0; i<nrows; i++) {
143       PetscInt ilen;
144       mat_i = c->i[i];
145       mat_j = c->j + mat_i;
146       mat_a = c->a + mat_i*bs2;
147       ilen  = c->ilen[i];
148       PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
149     }
150     PetscCall(PetscFree(work));
151   }
152 
153   /* Free work space */
154   PetscCall(ISRestoreIndices(iscol,&icol));
155   PetscCall(PetscFree(smap));
156   PetscCall(PetscFree(lens));
157   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
158   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
159 
160   PetscCall(ISRestoreIndices(isrow,&irow));
161   *B   = C;
162   PetscFunctionReturn(0);
163 }
164 
165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166 {
167   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168   IS             is1,is2;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   PetscCall(ISGetIndices(isrow,&irow));
174   PetscCall(ISGetIndices(iscol,&icol));
175   PetscCall(ISGetLocalSize(isrow,&nrows));
176   PetscCall(ISGetLocalSize(iscol,&ncols));
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   maxmnbs = PetscMax(a->mbs,a->nbs);
181   PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary));
182   PetscCall(PetscArrayzero(vary,a->mbs));
183   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184   for (i=0; i<a->mbs; i++) {
185     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186   }
187   count = 0;
188   for (i=0; i<nrows; i++) {
189     j = irow[i] / bs;
190     if ((vary[j]--)==bs) iary[count++] = j;
191   }
192   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1));
193 
194   PetscCall(PetscArrayzero(vary,a->nbs));
195   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
196   for (i=0; i<a->nbs; i++) {
197     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
198   }
199   count = 0;
200   for (i=0; i<ncols; i++) {
201     j = icol[i] / bs;
202     if ((vary[j]--)==bs) iary[count++] = j;
203   }
204   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2));
205   PetscCall(ISRestoreIndices(isrow,&irow));
206   PetscCall(ISRestoreIndices(iscol,&icol));
207   PetscCall(PetscFree2(vary,iary));
208 
209   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B));
210   PetscCall(ISDestroy(&is1));
211   PetscCall(ISDestroy(&is2));
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
216 {
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubSppt    *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   PetscCall((*submatj->destroy)(C));
222   PetscCall(MatDestroySubMatrix_Private(submatj));
223   PetscFunctionReturn(0);
224 }
225 
226 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
227 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
228 {
229   PetscInt       i;
230   Mat            C;
231   Mat_SeqBAIJ    *c;
232   Mat_SubSppt    *submatj;
233 
234   PetscFunctionBegin;
235   for (i=0; i<n; i++) {
236     C       = (*mat)[i];
237     c       = (Mat_SeqBAIJ*)C->data;
238     submatj = c->submatis1;
239     if (submatj) {
240       if (--((PetscObject)C)->refct <= 0) {
241         PetscCall((*submatj->destroy)(C));
242         PetscCall(MatDestroySubMatrix_Private(submatj));
243         PetscCall(PetscFree(C->defaultvectype));
244         PetscCall(PetscLayoutDestroy(&C->rmap));
245         PetscCall(PetscLayoutDestroy(&C->cmap));
246         PetscCall(PetscHeaderDestroy(&C));
247       }
248     } else {
249       PetscCall(MatDestroy(&C));
250     }
251   }
252 
253   /* Destroy Dummy submatrices created for reuse */
254   PetscCall(MatDestroySubMatrices_Dummy(n,mat));
255 
256   PetscCall(PetscFree(*mat));
257   PetscFunctionReturn(0);
258 }
259 
260 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
261 {
262   PetscInt       i;
263 
264   PetscFunctionBegin;
265   if (scall == MAT_INITIAL_MATRIX) {
266     PetscCall(PetscCalloc1(n+1,B));
267   }
268 
269   for (i=0; i<n; i++) {
270     PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]));
271   }
272   PetscFunctionReturn(0);
273 }
274 
275 /* -------------------------------------------------------*/
276 /* Should check that shapes of vectors and matrices match */
277 /* -------------------------------------------------------*/
278 
279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z,sum;
283   const PetscScalar *x;
284   const MatScalar   *v;
285   PetscInt          mbs,i,n;
286   const PetscInt    *idx,*ii,*ridx=NULL;
287   PetscBool         usecprow=a->compressedrow.use;
288 
289   PetscFunctionBegin;
290   PetscCall(VecGetArrayRead(xx,&x));
291   PetscCall(VecGetArrayWrite(zz,&z));
292 
293   if (usecprow) {
294     mbs  = a->compressedrow.nrows;
295     ii   = a->compressedrow.i;
296     ridx = a->compressedrow.rindex;
297     PetscCall(PetscArrayzero(z,a->mbs));
298   } else {
299     mbs = a->mbs;
300     ii  = a->i;
301   }
302 
303   for (i=0; i<mbs; i++) {
304     n   = ii[1] - ii[0];
305     v   = a->a + ii[0];
306     idx = a->j + ii[0];
307     ii++;
308     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
309     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310     sum = 0.0;
311     PetscSparseDensePlusDot(sum,x,v,idx,n);
312     if (usecprow) {
313       z[ridx[i]] = sum;
314     } else {
315       z[i]       = sum;
316     }
317   }
318   PetscCall(VecRestoreArrayRead(xx,&x));
319   PetscCall(VecRestoreArrayWrite(zz,&z));
320   PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt));
321   PetscFunctionReturn(0);
322 }
323 
324 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
325 {
326   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
327   PetscScalar       *z = NULL,sum1,sum2,*zarray;
328   const PetscScalar *x,*xb;
329   PetscScalar       x1,x2;
330   const MatScalar   *v;
331   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
332   PetscBool         usecprow=a->compressedrow.use;
333 
334   PetscFunctionBegin;
335   PetscCall(VecGetArrayRead(xx,&x));
336   PetscCall(VecGetArrayWrite(zz,&zarray));
337 
338   idx = a->j;
339   v   = a->a;
340   if (usecprow) {
341     mbs  = a->compressedrow.nrows;
342     ii   = a->compressedrow.i;
343     ridx = a->compressedrow.rindex;
344     PetscCall(PetscArrayzero(zarray,2*a->mbs));
345   } else {
346     mbs = a->mbs;
347     ii  = a->i;
348     z   = zarray;
349   }
350 
351   for (i=0; i<mbs; i++) {
352     n           = ii[1] - ii[0]; ii++;
353     sum1        = 0.0; sum2 = 0.0;
354     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
355     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356     for (j=0; j<n; j++) {
357       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
358       sum1 += v[0]*x1 + v[2]*x2;
359       sum2 += v[1]*x1 + v[3]*x2;
360       v    += 4;
361     }
362     if (usecprow) z = zarray + 2*ridx[i];
363     z[0] = sum1; z[1] = sum2;
364     if (!usecprow) z += 2;
365   }
366   PetscCall(VecRestoreArrayRead(xx,&x));
367   PetscCall(VecRestoreArrayWrite(zz,&zarray));
368   PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt));
369   PetscFunctionReturn(0);
370 }
371 
372 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
373 {
374   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
375   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
376   const PetscScalar *x,*xb;
377   const MatScalar   *v;
378   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
379   PetscBool         usecprow=a->compressedrow.use;
380 
381 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
382 #pragma disjoint(*v,*z,*xb)
383 #endif
384 
385   PetscFunctionBegin;
386   PetscCall(VecGetArrayRead(xx,&x));
387   PetscCall(VecGetArrayWrite(zz,&zarray));
388 
389   idx = a->j;
390   v   = a->a;
391   if (usecprow) {
392     mbs  = a->compressedrow.nrows;
393     ii   = a->compressedrow.i;
394     ridx = a->compressedrow.rindex;
395     PetscCall(PetscArrayzero(zarray,3*a->mbs));
396   } else {
397     mbs = a->mbs;
398     ii  = a->i;
399     z   = zarray;
400   }
401 
402   for (i=0; i<mbs; i++) {
403     n           = ii[1] - ii[0]; ii++;
404     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
405     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
406     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
407     for (j=0; j<n; j++) {
408       xb = x + 3*(*idx++);
409       x1 = xb[0];
410       x2 = xb[1];
411       x3 = xb[2];
412 
413       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
414       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
415       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
416       v    += 9;
417     }
418     if (usecprow) z = zarray + 3*ridx[i];
419     z[0] = sum1; z[1] = sum2; z[2] = sum3;
420     if (!usecprow) z += 3;
421   }
422   PetscCall(VecRestoreArrayRead(xx,&x));
423   PetscCall(VecRestoreArrayWrite(zz,&zarray));
424   PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt));
425   PetscFunctionReturn(0);
426 }
427 
428 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
429 {
430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
431   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
432   const PetscScalar *x,*xb;
433   const MatScalar   *v;
434   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
435   PetscBool         usecprow=a->compressedrow.use;
436 
437   PetscFunctionBegin;
438   PetscCall(VecGetArrayRead(xx,&x));
439   PetscCall(VecGetArrayWrite(zz,&zarray));
440 
441   idx = a->j;
442   v   = a->a;
443   if (usecprow) {
444     mbs  = a->compressedrow.nrows;
445     ii   = a->compressedrow.i;
446     ridx = a->compressedrow.rindex;
447     PetscCall(PetscArrayzero(zarray,4*a->mbs));
448   } else {
449     mbs = a->mbs;
450     ii  = a->i;
451     z   = zarray;
452   }
453 
454   for (i=0; i<mbs; i++) {
455     n = ii[1] - ii[0];
456     ii++;
457     sum1 = 0.0;
458     sum2 = 0.0;
459     sum3 = 0.0;
460     sum4 = 0.0;
461 
462     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
463     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
464     for (j=0; j<n; j++) {
465       xb    = x + 4*(*idx++);
466       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
467       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
468       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
469       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
470       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
471       v    += 16;
472     }
473     if (usecprow) z = zarray + 4*ridx[i];
474     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
475     if (!usecprow) z += 4;
476   }
477   PetscCall(VecRestoreArrayRead(xx,&x));
478   PetscCall(VecRestoreArrayWrite(zz,&zarray));
479   PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt));
480   PetscFunctionReturn(0);
481 }
482 
483 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
484 {
485   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
486   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
487   const PetscScalar *xb,*x;
488   const MatScalar   *v;
489   const PetscInt    *idx,*ii,*ridx=NULL;
490   PetscInt          mbs,i,j,n;
491   PetscBool         usecprow=a->compressedrow.use;
492 
493   PetscFunctionBegin;
494   PetscCall(VecGetArrayRead(xx,&x));
495   PetscCall(VecGetArrayWrite(zz,&zarray));
496 
497   idx = a->j;
498   v   = a->a;
499   if (usecprow) {
500     mbs  = a->compressedrow.nrows;
501     ii   = a->compressedrow.i;
502     ridx = a->compressedrow.rindex;
503     PetscCall(PetscArrayzero(zarray,5*a->mbs));
504   } else {
505     mbs = a->mbs;
506     ii  = a->i;
507     z   = zarray;
508   }
509 
510   for (i=0; i<mbs; i++) {
511     n           = ii[1] - ii[0]; ii++;
512     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
513     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
514     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
515     for (j=0; j<n; j++) {
516       xb    = x + 5*(*idx++);
517       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
518       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
519       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
520       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
521       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
522       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
523       v    += 25;
524     }
525     if (usecprow) z = zarray + 5*ridx[i];
526     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
527     if (!usecprow) z += 5;
528   }
529   PetscCall(VecRestoreArrayRead(xx,&x));
530   PetscCall(VecRestoreArrayWrite(zz,&zarray));
531   PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt));
532   PetscFunctionReturn(0);
533 }
534 
535 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
536 {
537   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
538   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
539   const PetscScalar *x,*xb;
540   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
541   const MatScalar   *v;
542   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
543   PetscBool         usecprow=a->compressedrow.use;
544 
545   PetscFunctionBegin;
546   PetscCall(VecGetArrayRead(xx,&x));
547   PetscCall(VecGetArrayWrite(zz,&zarray));
548 
549   idx = a->j;
550   v   = a->a;
551   if (usecprow) {
552     mbs  = a->compressedrow.nrows;
553     ii   = a->compressedrow.i;
554     ridx = a->compressedrow.rindex;
555     PetscCall(PetscArrayzero(zarray,6*a->mbs));
556   } else {
557     mbs = a->mbs;
558     ii  = a->i;
559     z   = zarray;
560   }
561 
562   for (i=0; i<mbs; i++) {
563     n  = ii[1] - ii[0];
564     ii++;
565     sum1 = 0.0;
566     sum2 = 0.0;
567     sum3 = 0.0;
568     sum4 = 0.0;
569     sum5 = 0.0;
570     sum6 = 0.0;
571 
572     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
573     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
574     for (j=0; j<n; j++) {
575       xb    = x + 6*(*idx++);
576       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
577       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
578       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
579       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
580       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
581       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
582       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
583       v    += 36;
584     }
585     if (usecprow) z = zarray + 6*ridx[i];
586     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
587     if (!usecprow) z += 6;
588   }
589 
590   PetscCall(VecRestoreArrayRead(xx,&x));
591   PetscCall(VecRestoreArrayWrite(zz,&zarray));
592   PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt));
593   PetscFunctionReturn(0);
594 }
595 
596 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
597 {
598   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
599   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
600   const PetscScalar *x,*xb;
601   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
602   const MatScalar   *v;
603   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
604   PetscBool         usecprow=a->compressedrow.use;
605 
606   PetscFunctionBegin;
607   PetscCall(VecGetArrayRead(xx,&x));
608   PetscCall(VecGetArrayWrite(zz,&zarray));
609 
610   idx = a->j;
611   v   = a->a;
612   if (usecprow) {
613     mbs  = a->compressedrow.nrows;
614     ii   = a->compressedrow.i;
615     ridx = a->compressedrow.rindex;
616     PetscCall(PetscArrayzero(zarray,7*a->mbs));
617   } else {
618     mbs = a->mbs;
619     ii  = a->i;
620     z   = zarray;
621   }
622 
623   for (i=0; i<mbs; i++) {
624     n  = ii[1] - ii[0];
625     ii++;
626     sum1 = 0.0;
627     sum2 = 0.0;
628     sum3 = 0.0;
629     sum4 = 0.0;
630     sum5 = 0.0;
631     sum6 = 0.0;
632     sum7 = 0.0;
633 
634     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
635     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
636     for (j=0; j<n; j++) {
637       xb    = x + 7*(*idx++);
638       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
639       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
640       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
641       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
642       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
643       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
644       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
645       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
646       v    += 49;
647     }
648     if (usecprow) z = zarray + 7*ridx[i];
649     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
650     if (!usecprow) z += 7;
651   }
652 
653   PetscCall(VecRestoreArrayRead(xx,&x));
654   PetscCall(VecRestoreArrayWrite(zz,&zarray));
655   PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt));
656   PetscFunctionReturn(0);
657 }
658 
659 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
660 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
661 {
662   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
663   PetscScalar       *z = NULL,*work,*workt,*zarray;
664   const PetscScalar *x,*xb;
665   const MatScalar   *v;
666   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
667   const PetscInt    *idx,*ii,*ridx=NULL;
668   PetscInt          k;
669   PetscBool         usecprow=a->compressedrow.use;
670 
671   __m256d a0,a1,a2,a3,a4,a5;
672   __m256d w0,w1,w2,w3;
673   __m256d z0,z1,z2;
674   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
675 
676   PetscFunctionBegin;
677   PetscCall(VecGetArrayRead(xx,&x));
678   PetscCall(VecGetArrayWrite(zz,&zarray));
679 
680   idx = a->j;
681   v   = a->a;
682   if (usecprow) {
683     mbs  = a->compressedrow.nrows;
684     ii   = a->compressedrow.i;
685     ridx = a->compressedrow.rindex;
686     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
687   } else {
688     mbs = a->mbs;
689     ii  = a->i;
690     z   = zarray;
691   }
692 
693   if (!a->mult_work) {
694     k    = PetscMax(A->rmap->n,A->cmap->n);
695     PetscCall(PetscMalloc1(k+1,&a->mult_work));
696   }
697 
698   work = a->mult_work;
699   for (i=0; i<mbs; i++) {
700     n           = ii[1] - ii[0]; ii++;
701     workt       = work;
702     for (j=0; j<n; j++) {
703       xb = x + bs*(*idx++);
704       for (k=0; k<bs; k++) workt[k] = xb[k];
705       workt += bs;
706     }
707     if (usecprow) z = zarray + bs*ridx[i];
708 
709     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
710 
711     for (j=0; j<n; j++) {
712       /* first column of a */
713       w0 = _mm256_set1_pd(work[j*9  ]);
714       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
715       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
716       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
717 
718       /* second column of a */
719       w1 = _mm256_set1_pd(work[j*9+ 1]);
720       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
721       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
722       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
723 
724       /* third column of a */
725       w2 = _mm256_set1_pd(work[j*9 +2]);
726       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
727       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
728       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
729 
730       /* fourth column of a */
731       w3 = _mm256_set1_pd(work[j*9+ 3]);
732       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
733       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
734       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
735 
736       /* fifth column of a */
737       w0 = _mm256_set1_pd(work[j*9+ 4]);
738       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
739       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
740       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
741 
742       /* sixth column of a */
743       w1 = _mm256_set1_pd(work[j*9+ 5]);
744       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
745       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
746       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
747 
748       /* seventh column of a */
749       w2 = _mm256_set1_pd(work[j*9+ 6]);
750       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
751       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
752       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
753 
754       /* eighth column of a */
755       w3 = _mm256_set1_pd(work[j*9+ 7]);
756       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
757       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
758       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
759 
760       /* ninth column of a */
761       w0 = _mm256_set1_pd(work[j*9+ 8]);
762       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
763       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
764       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
765     }
766 
767     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
768 
769     v += n*bs2;
770     if (!usecprow) z += bs;
771   }
772   PetscCall(VecRestoreArrayRead(xx,&x));
773   PetscCall(VecRestoreArrayWrite(zz,&zarray));
774   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
775   PetscFunctionReturn(0);
776 }
777 #endif
778 
779 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
780 {
781   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
782   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
783   const PetscScalar *x,*xb;
784   PetscScalar       *zarray,xv;
785   const MatScalar   *v;
786   const PetscInt    *ii,*ij=a->j,*idx;
787   PetscInt          mbs,i,j,k,n,*ridx=NULL;
788   PetscBool         usecprow=a->compressedrow.use;
789 
790   PetscFunctionBegin;
791   PetscCall(VecGetArrayRead(xx,&x));
792   PetscCall(VecGetArrayWrite(zz,&zarray));
793 
794   v = a->a;
795   if (usecprow) {
796     mbs  = a->compressedrow.nrows;
797     ii   = a->compressedrow.i;
798     ridx = a->compressedrow.rindex;
799     PetscCall(PetscArrayzero(zarray,11*a->mbs));
800   } else {
801     mbs = a->mbs;
802     ii  = a->i;
803     z   = zarray;
804   }
805 
806   for (i=0; i<mbs; i++) {
807     n    = ii[i+1] - ii[i];
808     idx  = ij + ii[i];
809     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
810     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
811 
812     for (j=0; j<n; j++) {
813       xb = x + 11*(idx[j]);
814 
815       for (k=0; k<11; k++) {
816         xv     =  xb[k];
817         sum1  += v[0]*xv;
818         sum2  += v[1]*xv;
819         sum3  += v[2]*xv;
820         sum4  += v[3]*xv;
821         sum5  += v[4]*xv;
822         sum6  += v[5]*xv;
823         sum7  += v[6]*xv;
824         sum8  += v[7]*xv;
825         sum9  += v[8]*xv;
826         sum10 += v[9]*xv;
827         sum11 += v[10]*xv;
828         v     += 11;
829       }
830     }
831     if (usecprow) z = zarray + 11*ridx[i];
832     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
833     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
834 
835     if (!usecprow) z += 11;
836   }
837 
838   PetscCall(VecRestoreArrayRead(xx,&x));
839   PetscCall(VecRestoreArrayWrite(zz,&zarray));
840   PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt));
841   PetscFunctionReturn(0);
842 }
843 
844 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
845 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
846 {
847   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
848   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
849   const PetscScalar *x,*xb;
850   PetscScalar       *zarray,xv;
851   const MatScalar   *v;
852   const PetscInt    *ii,*ij=a->j,*idx;
853   PetscInt          mbs,i,j,k,n,*ridx=NULL;
854   PetscBool         usecprow=a->compressedrow.use;
855 
856   PetscFunctionBegin;
857   PetscCall(VecGetArrayRead(xx,&x));
858   PetscCall(VecGetArrayWrite(zz,&zarray));
859 
860   v = a->a;
861   if (usecprow) {
862     mbs  = a->compressedrow.nrows;
863     ii   = a->compressedrow.i;
864     ridx = a->compressedrow.rindex;
865     PetscCall(PetscArrayzero(zarray,12*a->mbs));
866   } else {
867     mbs = a->mbs;
868     ii  = a->i;
869     z   = zarray;
870   }
871 
872   for (i=0; i<mbs; i++) {
873     n    = ii[i+1] - ii[i];
874     idx  = ij + ii[i];
875     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
876     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
877 
878     for (j=0; j<n; j++) {
879       xb = x + 12*(idx[j]);
880 
881       for (k=0; k<12; k++) {
882         xv     =  xb[k];
883         sum1  += v[0]*xv;
884         sum2  += v[1]*xv;
885         sum3  += v[2]*xv;
886         sum4  += v[3]*xv;
887         sum5  += v[4]*xv;
888         sum6  += v[5]*xv;
889         sum7  += v[6]*xv;
890         sum8  += v[7]*xv;
891         sum9  += v[8]*xv;
892         sum10 += v[9]*xv;
893         sum11 += v[10]*xv;
894         sum12 += v[11]*xv;
895         v     += 12;
896       }
897     }
898     if (usecprow) z = zarray + 12*ridx[i];
899     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
900     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
901     if (!usecprow) z += 12;
902   }
903   PetscCall(VecRestoreArrayRead(xx,&x));
904   PetscCall(VecRestoreArrayWrite(zz,&zarray));
905   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
910 {
911   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
912   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
913   const PetscScalar *x,*xb;
914   PetscScalar       *zarray,*yarray,xv;
915   const MatScalar   *v;
916   const PetscInt    *ii,*ij=a->j,*idx;
917   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
918   PetscBool         usecprow=a->compressedrow.use;
919 
920   PetscFunctionBegin;
921   PetscCall(VecGetArrayRead(xx,&x));
922   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
923 
924   v = a->a;
925   if (usecprow) {
926    if (zz != yy) {
927      PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
928     }
929     mbs  = a->compressedrow.nrows;
930     ii   = a->compressedrow.i;
931     ridx = a->compressedrow.rindex;
932   } else {
933     ii  = a->i;
934     y   = yarray;
935     z   = zarray;
936   }
937 
938   for (i=0; i<mbs; i++) {
939     n    = ii[i+1] - ii[i];
940     idx  = ij + ii[i];
941 
942     if (usecprow) {
943       y = yarray + 12*ridx[i];
944       z = zarray + 12*ridx[i];
945     }
946     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
947     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
948 
949     for (j=0; j<n; j++) {
950       xb = x + 12*(idx[j]);
951 
952       for (k=0; k<12; k++) {
953         xv     =  xb[k];
954         sum1  += v[0]*xv;
955         sum2  += v[1]*xv;
956         sum3  += v[2]*xv;
957         sum4  += v[3]*xv;
958         sum5  += v[4]*xv;
959         sum6  += v[5]*xv;
960         sum7  += v[6]*xv;
961         sum8  += v[7]*xv;
962         sum9  += v[8]*xv;
963         sum10 += v[9]*xv;
964         sum11 += v[10]*xv;
965         sum12 += v[11]*xv;
966         v     += 12;
967       }
968     }
969 
970     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
971     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
972     if (!usecprow) {
973       y += 12;
974       z += 12;
975     }
976   }
977   PetscCall(VecRestoreArrayRead(xx,&x));
978   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
979   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
980   PetscFunctionReturn(0);
981 }
982 
983 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
984 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
985 {
986   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
987   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
988   const PetscScalar *x,*xb;
989   PetscScalar       x1,x2,x3,x4,*zarray;
990   const MatScalar   *v;
991   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
992   PetscInt          mbs,i,j,n;
993   PetscBool         usecprow=a->compressedrow.use;
994 
995   PetscFunctionBegin;
996   PetscCall(VecGetArrayRead(xx,&x));
997   PetscCall(VecGetArrayWrite(zz,&zarray));
998 
999   v = a->a;
1000   if (usecprow) {
1001     mbs  = a->compressedrow.nrows;
1002     ii   = a->compressedrow.i;
1003     ridx = a->compressedrow.rindex;
1004     PetscCall(PetscArrayzero(zarray,12*a->mbs));
1005   } else {
1006     mbs = a->mbs;
1007     ii  = a->i;
1008     z   = zarray;
1009   }
1010 
1011   for (i=0; i<mbs; i++) {
1012     n    = ii[i+1] - ii[i];
1013     idx  = ij + ii[i];
1014 
1015     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1016     for (j=0; j<n; j++) {
1017       xb = x + 12*(idx[j]);
1018       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1019 
1020       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1021       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1022       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1023       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1024       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1025       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1026       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1027       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1028       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1029       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1030       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1031       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1032       v += 48;
1033 
1034       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1035 
1036       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1037       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1038       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1039       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1040       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1041       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1042       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1043       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1044       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1045       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1046       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1047       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1048       v     += 48;
1049 
1050       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1051       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1052       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1053       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1054       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1055       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1056       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1057       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1058       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1059       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1060       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1061       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1062       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1063       v     += 48;
1064 
1065     }
1066     if (usecprow) z = zarray + 12*ridx[i];
1067     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1068     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1069     if (!usecprow) z += 12;
1070   }
1071   PetscCall(VecRestoreArrayRead(xx,&x));
1072   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1073   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1074   PetscFunctionReturn(0);
1075 }
1076 
1077 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1078 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1079 {
1080   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1081   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1082   const PetscScalar *x,*xb;
1083   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1084   const MatScalar   *v;
1085   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1086   PetscInt          mbs = a->mbs,i,j,n;
1087   PetscBool         usecprow=a->compressedrow.use;
1088 
1089   PetscFunctionBegin;
1090   PetscCall(VecGetArrayRead(xx,&x));
1091   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1092 
1093   v = a->a;
1094   if (usecprow) {
1095     if (zz != yy) {
1096       PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
1097     }
1098     mbs  = a->compressedrow.nrows;
1099     ii   = a->compressedrow.i;
1100     ridx = a->compressedrow.rindex;
1101   } else {
1102     ii  = a->i;
1103     y   = yarray;
1104     z   = zarray;
1105   }
1106 
1107   for (i=0; i<mbs; i++) {
1108     n    = ii[i+1] - ii[i];
1109     idx  = ij + ii[i];
1110 
1111     if (usecprow) {
1112       y = yarray + 12*ridx[i];
1113       z = zarray + 12*ridx[i];
1114     }
1115     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1116     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1117 
1118     for (j=0; j<n; j++) {
1119       xb = x + 12*(idx[j]);
1120       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1121 
1122       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1123       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1124       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1125       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1126       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1127       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1128       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1129       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1130       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1131       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1132       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1133       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1134       v += 48;
1135 
1136       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1137 
1138       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1139       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1140       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1141       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1142       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1143       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1144       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1145       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1146       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1147       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1148       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1149       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1150       v     += 48;
1151 
1152       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1153       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1154       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1155       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1156       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1157       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1158       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1159       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1160       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1161       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1162       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1163       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1164       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1165       v     += 48;
1166 
1167     }
1168     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1169     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1170     if (!usecprow) {
1171       y += 12;
1172       z += 12;
1173     }
1174   }
1175   PetscCall(VecRestoreArrayRead(xx,&x));
1176   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1177   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1182 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1183 {
1184   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1185   PetscScalar       *z = NULL,*zarray;
1186   const PetscScalar *x,*work;
1187   const MatScalar   *v = a->a;
1188   PetscInt          mbs,i,j,n;
1189   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1190   PetscBool         usecprow=a->compressedrow.use;
1191   const PetscInt    bs = 12, bs2 = 144;
1192 
1193   __m256d a0,a1,a2,a3,a4,a5;
1194   __m256d w0,w1,w2,w3;
1195   __m256d z0,z1,z2;
1196 
1197   PetscFunctionBegin;
1198   PetscCall(VecGetArrayRead(xx,&x));
1199   PetscCall(VecGetArrayWrite(zz,&zarray));
1200 
1201   if (usecprow) {
1202     mbs  = a->compressedrow.nrows;
1203     ii   = a->compressedrow.i;
1204     ridx = a->compressedrow.rindex;
1205     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1206   } else {
1207     mbs = a->mbs;
1208     ii  = a->i;
1209     z   = zarray;
1210   }
1211 
1212   for (i=0; i<mbs; i++) {
1213     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1214 
1215     n  = ii[1] - ii[0]; ii++;
1216     for (j=0; j<n; j++) {
1217       work = x + bs*(*idx++);
1218 
1219       /* first column of a */
1220       w0 = _mm256_set1_pd(work[0]);
1221       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1222       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1223       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1224 
1225       /* second column of a */
1226       w1 = _mm256_set1_pd(work[1]);
1227       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1228       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1229       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1230 
1231       /* third column of a */
1232       w2 = _mm256_set1_pd(work[2]);
1233       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1234       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1235       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1236 
1237       /* fourth column of a */
1238       w3 = _mm256_set1_pd(work[3]);
1239       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1240       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1241       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1242 
1243       /* fifth column of a */
1244       w0 = _mm256_set1_pd(work[4]);
1245       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1246       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1247       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1248 
1249       /* sixth column of a */
1250       w1 = _mm256_set1_pd(work[5]);
1251       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1252       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1253       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1254 
1255       /* seventh column of a */
1256       w2 = _mm256_set1_pd(work[6]);
1257       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1258       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1259       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1260 
1261       /* eighth column of a */
1262       w3 = _mm256_set1_pd(work[7]);
1263       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1264       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1265       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1266 
1267       /* ninth column of a */
1268       w0 = _mm256_set1_pd(work[8]);
1269       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1270       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1271       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1272 
1273       /* tenth column of a */
1274       w1 = _mm256_set1_pd(work[9]);
1275       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1276       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1277       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1278 
1279       /* eleventh column of a */
1280       w2 = _mm256_set1_pd(work[10]);
1281       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1282       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1283       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1284 
1285       /* twelveth column of a */
1286       w3 = _mm256_set1_pd(work[11]);
1287       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1288       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1289       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1290 
1291       v += bs2;
1292     }
1293     if (usecprow) z = zarray + bs*ridx[i];
1294     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1295     if (!usecprow) z += bs;
1296   }
1297   PetscCall(VecRestoreArrayRead(xx,&x));
1298   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1299   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1300   PetscFunctionReturn(0);
1301 }
1302 #endif
1303 
1304 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1305 /* Default MatMult for block size 15 */
1306 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1307 {
1308   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1309   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1310   const PetscScalar *x,*xb;
1311   PetscScalar       *zarray,xv;
1312   const MatScalar   *v;
1313   const PetscInt    *ii,*ij=a->j,*idx;
1314   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1315   PetscBool         usecprow=a->compressedrow.use;
1316 
1317   PetscFunctionBegin;
1318   PetscCall(VecGetArrayRead(xx,&x));
1319   PetscCall(VecGetArrayWrite(zz,&zarray));
1320 
1321   v = a->a;
1322   if (usecprow) {
1323     mbs  = a->compressedrow.nrows;
1324     ii   = a->compressedrow.i;
1325     ridx = a->compressedrow.rindex;
1326     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1327   } else {
1328     mbs = a->mbs;
1329     ii  = a->i;
1330     z   = zarray;
1331   }
1332 
1333   for (i=0; i<mbs; i++) {
1334     n    = ii[i+1] - ii[i];
1335     idx  = ij + ii[i];
1336     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1337     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1338 
1339     for (j=0; j<n; j++) {
1340       xb = x + 15*(idx[j]);
1341 
1342       for (k=0; k<15; k++) {
1343         xv     =  xb[k];
1344         sum1  += v[0]*xv;
1345         sum2  += v[1]*xv;
1346         sum3  += v[2]*xv;
1347         sum4  += v[3]*xv;
1348         sum5  += v[4]*xv;
1349         sum6  += v[5]*xv;
1350         sum7  += v[6]*xv;
1351         sum8  += v[7]*xv;
1352         sum9  += v[8]*xv;
1353         sum10 += v[9]*xv;
1354         sum11 += v[10]*xv;
1355         sum12 += v[11]*xv;
1356         sum13 += v[12]*xv;
1357         sum14 += v[13]*xv;
1358         sum15 += v[14]*xv;
1359         v     += 15;
1360       }
1361     }
1362     if (usecprow) z = zarray + 15*ridx[i];
1363     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1364     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1365 
1366     if (!usecprow) z += 15;
1367   }
1368 
1369   PetscCall(VecRestoreArrayRead(xx,&x));
1370   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1371   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1376 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1377 {
1378   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1379   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1380   const PetscScalar *x,*xb;
1381   PetscScalar       x1,x2,x3,x4,*zarray;
1382   const MatScalar   *v;
1383   const PetscInt    *ii,*ij=a->j,*idx;
1384   PetscInt          mbs,i,j,n,*ridx=NULL;
1385   PetscBool         usecprow=a->compressedrow.use;
1386 
1387   PetscFunctionBegin;
1388   PetscCall(VecGetArrayRead(xx,&x));
1389   PetscCall(VecGetArrayWrite(zz,&zarray));
1390 
1391   v = a->a;
1392   if (usecprow) {
1393     mbs  = a->compressedrow.nrows;
1394     ii   = a->compressedrow.i;
1395     ridx = a->compressedrow.rindex;
1396     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1397   } else {
1398     mbs = a->mbs;
1399     ii  = a->i;
1400     z   = zarray;
1401   }
1402 
1403   for (i=0; i<mbs; i++) {
1404     n    = ii[i+1] - ii[i];
1405     idx  = ij + ii[i];
1406     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1407     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1408 
1409     for (j=0; j<n; j++) {
1410       xb = x + 15*(idx[j]);
1411       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1412 
1413       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1414       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1415       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1416       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1417       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1418       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1419       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1420       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1421       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1422       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1423       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1424       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1425       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1426       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1427       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1428 
1429       v += 60;
1430 
1431       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1432 
1433       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1434       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1435       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1436       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1437       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1438       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1439       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1440       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1441       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1442       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1443       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1444       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1445       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1446       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1447       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1448       v     += 60;
1449 
1450       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1451       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1452       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1453       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1454       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1455       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1456       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1457       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1458       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1459       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1460       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1461       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1462       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1463       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1464       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1465       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1466       v     += 60;
1467 
1468       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1469       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1470       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1471       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1472       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1473       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1474       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1475       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1476       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1477       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1478       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1479       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1480       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1481       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1482       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1483       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1484       v     += 45;
1485     }
1486     if (usecprow) z = zarray + 15*ridx[i];
1487     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1488     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1489 
1490     if (!usecprow) z += 15;
1491   }
1492 
1493   PetscCall(VecRestoreArrayRead(xx,&x));
1494   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1495   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1500 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1501 {
1502   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1503   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1504   const PetscScalar *x,*xb;
1505   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1506   const MatScalar   *v;
1507   const PetscInt    *ii,*ij=a->j,*idx;
1508   PetscInt          mbs,i,j,n,*ridx=NULL;
1509   PetscBool         usecprow=a->compressedrow.use;
1510 
1511   PetscFunctionBegin;
1512   PetscCall(VecGetArrayRead(xx,&x));
1513   PetscCall(VecGetArrayWrite(zz,&zarray));
1514 
1515   v = a->a;
1516   if (usecprow) {
1517     mbs  = a->compressedrow.nrows;
1518     ii   = a->compressedrow.i;
1519     ridx = a->compressedrow.rindex;
1520     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1521   } else {
1522     mbs = a->mbs;
1523     ii  = a->i;
1524     z   = zarray;
1525   }
1526 
1527   for (i=0; i<mbs; i++) {
1528     n    = ii[i+1] - ii[i];
1529     idx  = ij + ii[i];
1530     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1531     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1532 
1533     for (j=0; j<n; j++) {
1534       xb = x + 15*(idx[j]);
1535       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1536       x8 = xb[7];
1537 
1538       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1539       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1540       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1541       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1542       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1543       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1544       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1545       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1546       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1547       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1548       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1549       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1550       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1551       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1552       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1553       v     += 120;
1554 
1555       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1556 
1557       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1558       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1559       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1560       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1561       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1562       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1563       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1564       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1565       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1566       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1567       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1568       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1569       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1570       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1571       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1572       v     += 105;
1573     }
1574     if (usecprow) z = zarray + 15*ridx[i];
1575     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1576     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1577 
1578     if (!usecprow) z += 15;
1579   }
1580 
1581   PetscCall(VecRestoreArrayRead(xx,&x));
1582   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1583   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1588 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1589 {
1590   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1591   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1592   const PetscScalar *x,*xb;
1593   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1594   const MatScalar   *v;
1595   const PetscInt    *ii,*ij=a->j,*idx;
1596   PetscInt          mbs,i,j,n,*ridx=NULL;
1597   PetscBool         usecprow=a->compressedrow.use;
1598 
1599   PetscFunctionBegin;
1600   PetscCall(VecGetArrayRead(xx,&x));
1601   PetscCall(VecGetArrayWrite(zz,&zarray));
1602 
1603   v = a->a;
1604   if (usecprow) {
1605     mbs  = a->compressedrow.nrows;
1606     ii   = a->compressedrow.i;
1607     ridx = a->compressedrow.rindex;
1608     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1609   } else {
1610     mbs = a->mbs;
1611     ii  = a->i;
1612     z   = zarray;
1613   }
1614 
1615   for (i=0; i<mbs; i++) {
1616     n    = ii[i+1] - ii[i];
1617     idx  = ij + ii[i];
1618     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1619     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1620 
1621     for (j=0; j<n; j++) {
1622       xb = x + 15*(idx[j]);
1623       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1624       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1625 
1626       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1627       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1628       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1629       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1630       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1631       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1632       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1633       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1634       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1635       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1636       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1637       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1638       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1639       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1640       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1641       v     += 225;
1642     }
1643     if (usecprow) z = zarray + 15*ridx[i];
1644     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1645     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1646 
1647     if (!usecprow) z += 15;
1648   }
1649 
1650   PetscCall(VecRestoreArrayRead(xx,&x));
1651   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1652   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1653   PetscFunctionReturn(0);
1654 }
1655 
1656 /*
1657     This will not work with MatScalar == float because it calls the BLAS
1658 */
1659 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1660 {
1661   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1662   PetscScalar       *z = NULL,*work,*workt,*zarray;
1663   const PetscScalar *x,*xb;
1664   const MatScalar   *v;
1665   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1666   const PetscInt    *idx,*ii,*ridx=NULL;
1667   PetscInt          ncols,k;
1668   PetscBool         usecprow=a->compressedrow.use;
1669 
1670   PetscFunctionBegin;
1671   PetscCall(VecGetArrayRead(xx,&x));
1672   PetscCall(VecGetArrayWrite(zz,&zarray));
1673 
1674   idx = a->j;
1675   v   = a->a;
1676   if (usecprow) {
1677     mbs  = a->compressedrow.nrows;
1678     ii   = a->compressedrow.i;
1679     ridx = a->compressedrow.rindex;
1680     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1681   } else {
1682     mbs = a->mbs;
1683     ii  = a->i;
1684     z   = zarray;
1685   }
1686 
1687   if (!a->mult_work) {
1688     k    = PetscMax(A->rmap->n,A->cmap->n);
1689     PetscCall(PetscMalloc1(k+1,&a->mult_work));
1690   }
1691   work = a->mult_work;
1692   for (i=0; i<mbs; i++) {
1693     n           = ii[1] - ii[0]; ii++;
1694     ncols       = n*bs;
1695     workt       = work;
1696     for (j=0; j<n; j++) {
1697       xb = x + bs*(*idx++);
1698       for (k=0; k<bs; k++) workt[k] = xb[k];
1699       workt += bs;
1700     }
1701     if (usecprow) z = zarray + bs*ridx[i];
1702     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1703     v += n*bs2;
1704     if (!usecprow) z += bs;
1705   }
1706   PetscCall(VecRestoreArrayRead(xx,&x));
1707   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1708   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1709   PetscFunctionReturn(0);
1710 }
1711 
1712 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1713 {
1714   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1715   const PetscScalar *x;
1716   PetscScalar       *y,*z,sum;
1717   const MatScalar   *v;
1718   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1719   const PetscInt    *idx,*ii;
1720   PetscBool         usecprow=a->compressedrow.use;
1721 
1722   PetscFunctionBegin;
1723   PetscCall(VecGetArrayRead(xx,&x));
1724   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
1725 
1726   idx = a->j;
1727   v   = a->a;
1728   if (usecprow) {
1729     if (zz != yy) {
1730       PetscCall(PetscArraycpy(z,y,mbs));
1731     }
1732     mbs  = a->compressedrow.nrows;
1733     ii   = a->compressedrow.i;
1734     ridx = a->compressedrow.rindex;
1735   } else {
1736     ii = a->i;
1737   }
1738 
1739   for (i=0; i<mbs; i++) {
1740     n = ii[1] - ii[0];
1741     ii++;
1742     if (!usecprow) {
1743       sum = y[i];
1744     } else {
1745       sum = y[ridx[i]];
1746     }
1747     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1748     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1749     PetscSparseDensePlusDot(sum,x,v,idx,n);
1750     v   += n;
1751     idx += n;
1752     if (usecprow) {
1753       z[ridx[i]] = sum;
1754     } else {
1755       z[i] = sum;
1756     }
1757   }
1758   PetscCall(VecRestoreArrayRead(xx,&x));
1759   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
1760   PetscCall(PetscLogFlops(2.0*a->nz));
1761   PetscFunctionReturn(0);
1762 }
1763 
1764 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1765 {
1766   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1767   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1768   const PetscScalar *x,*xb;
1769   PetscScalar       x1,x2,*yarray,*zarray;
1770   const MatScalar   *v;
1771   PetscInt          mbs = a->mbs,i,n,j;
1772   const PetscInt    *idx,*ii,*ridx = NULL;
1773   PetscBool         usecprow = a->compressedrow.use;
1774 
1775   PetscFunctionBegin;
1776   PetscCall(VecGetArrayRead(xx,&x));
1777   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1778 
1779   idx = a->j;
1780   v   = a->a;
1781   if (usecprow) {
1782     if (zz != yy) {
1783       PetscCall(PetscArraycpy(zarray,yarray,2*mbs));
1784     }
1785     mbs  = a->compressedrow.nrows;
1786     ii   = a->compressedrow.i;
1787     ridx = a->compressedrow.rindex;
1788   } else {
1789     ii = a->i;
1790     y  = yarray;
1791     z  = zarray;
1792   }
1793 
1794   for (i=0; i<mbs; i++) {
1795     n = ii[1] - ii[0]; ii++;
1796     if (usecprow) {
1797       z = zarray + 2*ridx[i];
1798       y = yarray + 2*ridx[i];
1799     }
1800     sum1 = y[0]; sum2 = y[1];
1801     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1802     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1803     for (j=0; j<n; j++) {
1804       xb = x + 2*(*idx++);
1805       x1 = xb[0];
1806       x2 = xb[1];
1807 
1808       sum1 += v[0]*x1 + v[2]*x2;
1809       sum2 += v[1]*x1 + v[3]*x2;
1810       v    += 4;
1811     }
1812     z[0] = sum1; z[1] = sum2;
1813     if (!usecprow) {
1814       z += 2; y += 2;
1815     }
1816   }
1817   PetscCall(VecRestoreArrayRead(xx,&x));
1818   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1819   PetscCall(PetscLogFlops(4.0*a->nz));
1820   PetscFunctionReturn(0);
1821 }
1822 
1823 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1824 {
1825   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1826   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1827   const PetscScalar *x,*xb;
1828   const MatScalar   *v;
1829   PetscInt          mbs = a->mbs,i,j,n;
1830   const PetscInt    *idx,*ii,*ridx = NULL;
1831   PetscBool         usecprow = a->compressedrow.use;
1832 
1833   PetscFunctionBegin;
1834   PetscCall(VecGetArrayRead(xx,&x));
1835   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1836 
1837   idx = a->j;
1838   v   = a->a;
1839   if (usecprow) {
1840     if (zz != yy) {
1841       PetscCall(PetscArraycpy(zarray,yarray,3*mbs));
1842     }
1843     mbs  = a->compressedrow.nrows;
1844     ii   = a->compressedrow.i;
1845     ridx = a->compressedrow.rindex;
1846   } else {
1847     ii = a->i;
1848     y  = yarray;
1849     z  = zarray;
1850   }
1851 
1852   for (i=0; i<mbs; i++) {
1853     n = ii[1] - ii[0]; ii++;
1854     if (usecprow) {
1855       z = zarray + 3*ridx[i];
1856       y = yarray + 3*ridx[i];
1857     }
1858     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1859     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1860     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1861     for (j=0; j<n; j++) {
1862       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1863       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1864       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1865       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1866       v    += 9;
1867     }
1868     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1869     if (!usecprow) {
1870       z += 3; y += 3;
1871     }
1872   }
1873   PetscCall(VecRestoreArrayRead(xx,&x));
1874   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1875   PetscCall(PetscLogFlops(18.0*a->nz));
1876   PetscFunctionReturn(0);
1877 }
1878 
1879 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1880 {
1881   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1882   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1883   const PetscScalar *x,*xb;
1884   const MatScalar   *v;
1885   PetscInt          mbs = a->mbs,i,j,n;
1886   const PetscInt    *idx,*ii,*ridx=NULL;
1887   PetscBool         usecprow=a->compressedrow.use;
1888 
1889   PetscFunctionBegin;
1890   PetscCall(VecGetArrayRead(xx,&x));
1891   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1892 
1893   idx = a->j;
1894   v   = a->a;
1895   if (usecprow) {
1896     if (zz != yy) {
1897       PetscCall(PetscArraycpy(zarray,yarray,4*mbs));
1898     }
1899     mbs  = a->compressedrow.nrows;
1900     ii   = a->compressedrow.i;
1901     ridx = a->compressedrow.rindex;
1902   } else {
1903     ii = a->i;
1904     y  = yarray;
1905     z  = zarray;
1906   }
1907 
1908   for (i=0; i<mbs; i++) {
1909     n = ii[1] - ii[0]; ii++;
1910     if (usecprow) {
1911       z = zarray + 4*ridx[i];
1912       y = yarray + 4*ridx[i];
1913     }
1914     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1915     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1916     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1917     for (j=0; j<n; j++) {
1918       xb    = x + 4*(*idx++);
1919       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1920       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1921       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1922       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1923       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1924       v    += 16;
1925     }
1926     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1927     if (!usecprow) {
1928       z += 4; y += 4;
1929     }
1930   }
1931   PetscCall(VecRestoreArrayRead(xx,&x));
1932   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1933   PetscCall(PetscLogFlops(32.0*a->nz));
1934   PetscFunctionReturn(0);
1935 }
1936 
1937 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1938 {
1939   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1940   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1941   const PetscScalar *x,*xb;
1942   PetscScalar       *yarray,*zarray;
1943   const MatScalar   *v;
1944   PetscInt          mbs = a->mbs,i,j,n;
1945   const PetscInt    *idx,*ii,*ridx = NULL;
1946   PetscBool         usecprow=a->compressedrow.use;
1947 
1948   PetscFunctionBegin;
1949   PetscCall(VecGetArrayRead(xx,&x));
1950   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1951 
1952   idx = a->j;
1953   v   = a->a;
1954   if (usecprow) {
1955     if (zz != yy) {
1956       PetscCall(PetscArraycpy(zarray,yarray,5*mbs));
1957     }
1958     mbs  = a->compressedrow.nrows;
1959     ii   = a->compressedrow.i;
1960     ridx = a->compressedrow.rindex;
1961   } else {
1962     ii = a->i;
1963     y  = yarray;
1964     z  = zarray;
1965   }
1966 
1967   for (i=0; i<mbs; i++) {
1968     n = ii[1] - ii[0]; ii++;
1969     if (usecprow) {
1970       z = zarray + 5*ridx[i];
1971       y = yarray + 5*ridx[i];
1972     }
1973     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1974     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1975     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1976     for (j=0; j<n; j++) {
1977       xb    = x + 5*(*idx++);
1978       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1979       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1980       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1981       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1982       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1983       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1984       v    += 25;
1985     }
1986     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1987     if (!usecprow) {
1988       z += 5; y += 5;
1989     }
1990   }
1991   PetscCall(VecRestoreArrayRead(xx,&x));
1992   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1993   PetscCall(PetscLogFlops(50.0*a->nz));
1994   PetscFunctionReturn(0);
1995 }
1996 
1997 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1998 {
1999   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2000   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2001   const PetscScalar *x,*xb;
2002   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2003   const MatScalar   *v;
2004   PetscInt          mbs = a->mbs,i,j,n;
2005   const PetscInt    *idx,*ii,*ridx=NULL;
2006   PetscBool         usecprow=a->compressedrow.use;
2007 
2008   PetscFunctionBegin;
2009   PetscCall(VecGetArrayRead(xx,&x));
2010   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2011 
2012   idx = a->j;
2013   v   = a->a;
2014   if (usecprow) {
2015     if (zz != yy) {
2016       PetscCall(PetscArraycpy(zarray,yarray,6*mbs));
2017     }
2018     mbs  = a->compressedrow.nrows;
2019     ii   = a->compressedrow.i;
2020     ridx = a->compressedrow.rindex;
2021   } else {
2022     ii = a->i;
2023     y  = yarray;
2024     z  = zarray;
2025   }
2026 
2027   for (i=0; i<mbs; i++) {
2028     n = ii[1] - ii[0]; ii++;
2029     if (usecprow) {
2030       z = zarray + 6*ridx[i];
2031       y = yarray + 6*ridx[i];
2032     }
2033     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2034     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2035     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2036     for (j=0; j<n; j++) {
2037       xb    = x + 6*(*idx++);
2038       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2039       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2040       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2041       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2042       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2043       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2044       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2045       v    += 36;
2046     }
2047     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2048     if (!usecprow) {
2049       z += 6; y += 6;
2050     }
2051   }
2052   PetscCall(VecRestoreArrayRead(xx,&x));
2053   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2054   PetscCall(PetscLogFlops(72.0*a->nz));
2055   PetscFunctionReturn(0);
2056 }
2057 
2058 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2059 {
2060   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2061   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2062   const PetscScalar *x,*xb;
2063   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2064   const MatScalar   *v;
2065   PetscInt          mbs = a->mbs,i,j,n;
2066   const PetscInt    *idx,*ii,*ridx = NULL;
2067   PetscBool         usecprow=a->compressedrow.use;
2068 
2069   PetscFunctionBegin;
2070   PetscCall(VecGetArrayRead(xx,&x));
2071   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2072 
2073   idx = a->j;
2074   v   = a->a;
2075   if (usecprow) {
2076     if (zz != yy) {
2077       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2078     }
2079     mbs  = a->compressedrow.nrows;
2080     ii   = a->compressedrow.i;
2081     ridx = a->compressedrow.rindex;
2082   } else {
2083     ii = a->i;
2084     y  = yarray;
2085     z  = zarray;
2086   }
2087 
2088   for (i=0; i<mbs; i++) {
2089     n = ii[1] - ii[0]; ii++;
2090     if (usecprow) {
2091       z = zarray + 7*ridx[i];
2092       y = yarray + 7*ridx[i];
2093     }
2094     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2095     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2096     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2097     for (j=0; j<n; j++) {
2098       xb    = x + 7*(*idx++);
2099       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2100       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2101       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2102       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2103       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2104       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2105       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2106       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2107       v    += 49;
2108     }
2109     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2110     if (!usecprow) {
2111       z += 7; y += 7;
2112     }
2113   }
2114   PetscCall(VecRestoreArrayRead(xx,&x));
2115   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2116   PetscCall(PetscLogFlops(98.0*a->nz));
2117   PetscFunctionReturn(0);
2118 }
2119 
2120 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2121 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2122 {
2123   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2124   PetscScalar       *z = NULL,*work,*workt,*zarray;
2125   const PetscScalar *x,*xb;
2126   const MatScalar   *v;
2127   PetscInt          mbs,i,j,n;
2128   PetscInt          k;
2129   PetscBool         usecprow=a->compressedrow.use;
2130   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2131 
2132   __m256d a0,a1,a2,a3,a4,a5;
2133   __m256d w0,w1,w2,w3;
2134   __m256d z0,z1,z2;
2135   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2136 
2137   PetscFunctionBegin;
2138   PetscCall(VecCopy(yy,zz));
2139   PetscCall(VecGetArrayRead(xx,&x));
2140   PetscCall(VecGetArray(zz,&zarray));
2141 
2142   idx = a->j;
2143   v   = a->a;
2144   if (usecprow) {
2145     mbs  = a->compressedrow.nrows;
2146     ii   = a->compressedrow.i;
2147     ridx = a->compressedrow.rindex;
2148   } else {
2149     mbs = a->mbs;
2150     ii  = a->i;
2151     z   = zarray;
2152   }
2153 
2154   if (!a->mult_work) {
2155     k    = PetscMax(A->rmap->n,A->cmap->n);
2156     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2157   }
2158 
2159   work = a->mult_work;
2160   for (i=0; i<mbs; i++) {
2161     n           = ii[1] - ii[0]; ii++;
2162     workt       = work;
2163     for (j=0; j<n; j++) {
2164       xb = x + bs*(*idx++);
2165       for (k=0; k<bs; k++) workt[k] = xb[k];
2166       workt += bs;
2167     }
2168     if (usecprow) z = zarray + bs*ridx[i];
2169 
2170     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2171 
2172     for (j=0; j<n; j++) {
2173       /* first column of a */
2174       w0 = _mm256_set1_pd(work[j*9  ]);
2175       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2176       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2177       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2178 
2179       /* second column of a */
2180       w1 = _mm256_set1_pd(work[j*9+ 1]);
2181       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2182       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2183       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2184 
2185       /* third column of a */
2186       w2 = _mm256_set1_pd(work[j*9+ 2]);
2187       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2188       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2189       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2190 
2191       /* fourth column of a */
2192       w3 = _mm256_set1_pd(work[j*9+ 3]);
2193       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2194       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2195       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2196 
2197       /* fifth column of a */
2198       w0 = _mm256_set1_pd(work[j*9+ 4]);
2199       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2200       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2201       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2202 
2203       /* sixth column of a */
2204       w1 = _mm256_set1_pd(work[j*9+ 5]);
2205       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2206       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2207       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2208 
2209       /* seventh column of a */
2210       w2 = _mm256_set1_pd(work[j*9+ 6]);
2211       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2212       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2213       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2214 
2215       /* eighth column of a */
2216       w3 = _mm256_set1_pd(work[j*9+ 7]);
2217       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2218       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2219       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2220 
2221       /* ninth column of a */
2222       w0 = _mm256_set1_pd(work[j*9+ 8]);
2223       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2224       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2225       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2226     }
2227 
2228     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2229 
2230     v += n*bs2;
2231     if (!usecprow) z += bs;
2232   }
2233   PetscCall(VecRestoreArrayRead(xx,&x));
2234   PetscCall(VecRestoreArray(zz,&zarray));
2235   PetscCall(PetscLogFlops(162.0*a->nz));
2236   PetscFunctionReturn(0);
2237 }
2238 #endif
2239 
2240 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2241 {
2242   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2243   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2244   const PetscScalar *x,*xb;
2245   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2246   const MatScalar   *v;
2247   PetscInt          mbs = a->mbs,i,j,n;
2248   const PetscInt    *idx,*ii,*ridx = NULL;
2249   PetscBool         usecprow=a->compressedrow.use;
2250 
2251   PetscFunctionBegin;
2252   PetscCall(VecGetArrayRead(xx,&x));
2253   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2254 
2255   idx = a->j;
2256   v   = a->a;
2257   if (usecprow) {
2258     if (zz != yy) {
2259       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2260     }
2261     mbs  = a->compressedrow.nrows;
2262     ii   = a->compressedrow.i;
2263     ridx = a->compressedrow.rindex;
2264   } else {
2265     ii = a->i;
2266     y  = yarray;
2267     z  = zarray;
2268   }
2269 
2270   for (i=0; i<mbs; i++) {
2271     n = ii[1] - ii[0]; ii++;
2272     if (usecprow) {
2273       z = zarray + 11*ridx[i];
2274       y = yarray + 11*ridx[i];
2275     }
2276     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2277     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2278     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2279     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2280     for (j=0; j<n; j++) {
2281       xb    = x + 11*(*idx++);
2282       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2283       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2284       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2285       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2286       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2287       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2288       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2289       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2290       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2291       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2292       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2293       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2294       v    += 121;
2295     }
2296     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2297     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2298     if (!usecprow) {
2299       z += 11; y += 11;
2300     }
2301   }
2302   PetscCall(VecRestoreArrayRead(xx,&x));
2303   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2304   PetscCall(PetscLogFlops(242.0*a->nz));
2305   PetscFunctionReturn(0);
2306 }
2307 
2308 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2309 {
2310   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2311   PetscScalar       *z = NULL,*work,*workt,*zarray;
2312   const PetscScalar *x,*xb;
2313   const MatScalar   *v;
2314   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2315   PetscInt          ncols,k;
2316   const PetscInt    *ridx = NULL,*idx,*ii;
2317   PetscBool         usecprow = a->compressedrow.use;
2318 
2319   PetscFunctionBegin;
2320   PetscCall(VecCopy(yy,zz));
2321   PetscCall(VecGetArrayRead(xx,&x));
2322   PetscCall(VecGetArray(zz,&zarray));
2323 
2324   idx = a->j;
2325   v   = a->a;
2326   if (usecprow) {
2327     mbs  = a->compressedrow.nrows;
2328     ii   = a->compressedrow.i;
2329     ridx = a->compressedrow.rindex;
2330   } else {
2331     mbs = a->mbs;
2332     ii  = a->i;
2333     z   = zarray;
2334   }
2335 
2336   if (!a->mult_work) {
2337     k    = PetscMax(A->rmap->n,A->cmap->n);
2338     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2339   }
2340   work = a->mult_work;
2341   for (i=0; i<mbs; i++) {
2342     n     = ii[1] - ii[0]; ii++;
2343     ncols = n*bs;
2344     workt = work;
2345     for (j=0; j<n; j++) {
2346       xb = x + bs*(*idx++);
2347       for (k=0; k<bs; k++) workt[k] = xb[k];
2348       workt += bs;
2349     }
2350     if (usecprow) z = zarray + bs*ridx[i];
2351     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2352     v += n*bs2;
2353     if (!usecprow) z += bs;
2354   }
2355   PetscCall(VecRestoreArrayRead(xx,&x));
2356   PetscCall(VecRestoreArray(zz,&zarray));
2357   PetscCall(PetscLogFlops(2.0*a->nz*bs2));
2358   PetscFunctionReturn(0);
2359 }
2360 
2361 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2362 {
2363   PetscScalar    zero = 0.0;
2364 
2365   PetscFunctionBegin;
2366   PetscCall(VecSet(zz,zero));
2367   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2368   PetscFunctionReturn(0);
2369 }
2370 
2371 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2372 {
2373   PetscScalar    zero = 0.0;
2374 
2375   PetscFunctionBegin;
2376   PetscCall(VecSet(zz,zero));
2377   PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2378   PetscFunctionReturn(0);
2379 }
2380 
2381 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2382 {
2383   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2384   PetscScalar       *z,x1,x2,x3,x4,x5;
2385   const PetscScalar *x,*xb = NULL;
2386   const MatScalar   *v;
2387   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2388   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2389   Mat_CompressedRow cprow = a->compressedrow;
2390   PetscBool         usecprow = cprow.use;
2391 
2392   PetscFunctionBegin;
2393   if (yy != zz) PetscCall(VecCopy(yy,zz));
2394   PetscCall(VecGetArrayRead(xx,&x));
2395   PetscCall(VecGetArray(zz,&z));
2396 
2397   idx = a->j;
2398   v   = a->a;
2399   if (usecprow) {
2400     mbs  = cprow.nrows;
2401     ii   = cprow.i;
2402     ridx = cprow.rindex;
2403   } else {
2404     mbs=a->mbs;
2405     ii = a->i;
2406     xb = x;
2407   }
2408 
2409   switch (bs) {
2410   case 1:
2411     for (i=0; i<mbs; i++) {
2412       if (usecprow) xb = x + ridx[i];
2413       x1 = xb[0];
2414       ib = idx + ii[0];
2415       n  = ii[1] - ii[0]; ii++;
2416       for (j=0; j<n; j++) {
2417         rval     = ib[j];
2418         z[rval] += PetscConj(*v) * x1;
2419         v++;
2420       }
2421       if (!usecprow) xb++;
2422     }
2423     break;
2424   case 2:
2425     for (i=0; i<mbs; i++) {
2426       if (usecprow) xb = x + 2*ridx[i];
2427       x1 = xb[0]; x2 = xb[1];
2428       ib = idx + ii[0];
2429       n  = ii[1] - ii[0]; ii++;
2430       for (j=0; j<n; j++) {
2431         rval       = ib[j]*2;
2432         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2433         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2434         v         += 4;
2435       }
2436       if (!usecprow) xb += 2;
2437     }
2438     break;
2439   case 3:
2440     for (i=0; i<mbs; i++) {
2441       if (usecprow) xb = x + 3*ridx[i];
2442       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2443       ib = idx + ii[0];
2444       n  = ii[1] - ii[0]; ii++;
2445       for (j=0; j<n; j++) {
2446         rval       = ib[j]*3;
2447         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2448         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2449         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2450         v         += 9;
2451       }
2452       if (!usecprow) xb += 3;
2453     }
2454     break;
2455   case 4:
2456     for (i=0; i<mbs; i++) {
2457       if (usecprow) xb = x + 4*ridx[i];
2458       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2459       ib = idx + ii[0];
2460       n  = ii[1] - ii[0]; ii++;
2461       for (j=0; j<n; j++) {
2462         rval       = ib[j]*4;
2463         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2464         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2465         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2466         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2467         v         += 16;
2468       }
2469       if (!usecprow) xb += 4;
2470     }
2471     break;
2472   case 5:
2473     for (i=0; i<mbs; i++) {
2474       if (usecprow) xb = x + 5*ridx[i];
2475       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2476       x4 = xb[3]; x5 = xb[4];
2477       ib = idx + ii[0];
2478       n  = ii[1] - ii[0]; ii++;
2479       for (j=0; j<n; j++) {
2480         rval       = ib[j]*5;
2481         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2482         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2483         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2484         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2485         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2486         v         += 25;
2487       }
2488       if (!usecprow) xb += 5;
2489     }
2490     break;
2491   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2492     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2493 #if 0
2494     {
2495       PetscInt          ncols,k,bs2=a->bs2;
2496       PetscScalar       *work,*workt,zb;
2497       const PetscScalar *xtmp;
2498       if (!a->mult_work) {
2499         k    = PetscMax(A->rmap->n,A->cmap->n);
2500         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2501       }
2502       work = a->mult_work;
2503       xtmp = x;
2504       for (i=0; i<mbs; i++) {
2505         n     = ii[1] - ii[0]; ii++;
2506         ncols = n*bs;
2507         PetscCall(PetscArrayzero(work,ncols));
2508         if (usecprow) xtmp = x + bs*ridx[i];
2509         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2510         v += n*bs2;
2511         if (!usecprow) xtmp += bs;
2512         workt = work;
2513         for (j=0; j<n; j++) {
2514           zb = z + bs*(*idx++);
2515           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2516           workt += bs;
2517         }
2518       }
2519     }
2520 #endif
2521   }
2522   PetscCall(VecRestoreArrayRead(xx,&x));
2523   PetscCall(VecRestoreArray(zz,&z));
2524   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2525   PetscFunctionReturn(0);
2526 }
2527 
2528 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2529 {
2530   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2531   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2532   const PetscScalar *x,*xb = NULL;
2533   const MatScalar   *v;
2534   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2535   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2536   Mat_CompressedRow cprow   = a->compressedrow;
2537   PetscBool         usecprow=cprow.use;
2538 
2539   PetscFunctionBegin;
2540   if (yy != zz) PetscCall(VecCopy(yy,zz));
2541   PetscCall(VecGetArrayRead(xx,&x));
2542   PetscCall(VecGetArray(zz,&z));
2543 
2544   idx = a->j;
2545   v   = a->a;
2546   if (usecprow) {
2547     mbs  = cprow.nrows;
2548     ii   = cprow.i;
2549     ridx = cprow.rindex;
2550   } else {
2551     mbs=a->mbs;
2552     ii = a->i;
2553     xb = x;
2554   }
2555 
2556   switch (bs) {
2557   case 1:
2558     for (i=0; i<mbs; i++) {
2559       if (usecprow) xb = x + ridx[i];
2560       x1 = xb[0];
2561       ib = idx + ii[0];
2562       n  = ii[1] - ii[0]; ii++;
2563       for (j=0; j<n; j++) {
2564         rval     = ib[j];
2565         z[rval] += *v * x1;
2566         v++;
2567       }
2568       if (!usecprow) xb++;
2569     }
2570     break;
2571   case 2:
2572     for (i=0; i<mbs; i++) {
2573       if (usecprow) xb = x + 2*ridx[i];
2574       x1 = xb[0]; x2 = xb[1];
2575       ib = idx + ii[0];
2576       n  = ii[1] - ii[0]; ii++;
2577       for (j=0; j<n; j++) {
2578         rval       = ib[j]*2;
2579         z[rval++] += v[0]*x1 + v[1]*x2;
2580         z[rval++] += v[2]*x1 + v[3]*x2;
2581         v         += 4;
2582       }
2583       if (!usecprow) xb += 2;
2584     }
2585     break;
2586   case 3:
2587     for (i=0; i<mbs; i++) {
2588       if (usecprow) xb = x + 3*ridx[i];
2589       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2590       ib = idx + ii[0];
2591       n  = ii[1] - ii[0]; ii++;
2592       for (j=0; j<n; j++) {
2593         rval       = ib[j]*3;
2594         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2595         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2596         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2597         v         += 9;
2598       }
2599       if (!usecprow) xb += 3;
2600     }
2601     break;
2602   case 4:
2603     for (i=0; i<mbs; i++) {
2604       if (usecprow) xb = x + 4*ridx[i];
2605       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2606       ib = idx + ii[0];
2607       n  = ii[1] - ii[0]; ii++;
2608       for (j=0; j<n; j++) {
2609         rval       = ib[j]*4;
2610         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2611         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2612         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2613         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2614         v         += 16;
2615       }
2616       if (!usecprow) xb += 4;
2617     }
2618     break;
2619   case 5:
2620     for (i=0; i<mbs; i++) {
2621       if (usecprow) xb = x + 5*ridx[i];
2622       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2623       x4 = xb[3]; x5 = xb[4];
2624       ib = idx + ii[0];
2625       n  = ii[1] - ii[0]; ii++;
2626       for (j=0; j<n; j++) {
2627         rval       = ib[j]*5;
2628         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2629         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2630         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2631         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2632         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2633         v         += 25;
2634       }
2635       if (!usecprow) xb += 5;
2636     }
2637     break;
2638   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2639     PetscInt          ncols,k;
2640     PetscScalar       *work,*workt;
2641     const PetscScalar *xtmp;
2642     if (!a->mult_work) {
2643       k    = PetscMax(A->rmap->n,A->cmap->n);
2644       PetscCall(PetscMalloc1(k+1,&a->mult_work));
2645     }
2646     work = a->mult_work;
2647     xtmp = x;
2648     for (i=0; i<mbs; i++) {
2649       n     = ii[1] - ii[0]; ii++;
2650       ncols = n*bs;
2651       PetscCall(PetscArrayzero(work,ncols));
2652       if (usecprow) xtmp = x + bs*ridx[i];
2653       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2654       v += n*bs2;
2655       if (!usecprow) xtmp += bs;
2656       workt = work;
2657       for (j=0; j<n; j++) {
2658         zb = z + bs*(*idx++);
2659         for (k=0; k<bs; k++) zb[k] += workt[k];
2660         workt += bs;
2661       }
2662     }
2663     }
2664   }
2665   PetscCall(VecRestoreArrayRead(xx,&x));
2666   PetscCall(VecRestoreArray(zz,&z));
2667   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2668   PetscFunctionReturn(0);
2669 }
2670 
2671 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2672 {
2673   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2674   PetscInt       totalnz = a->bs2*a->nz;
2675   PetscScalar    oalpha  = alpha;
2676   PetscBLASInt   one = 1,tnz;
2677 
2678   PetscFunctionBegin;
2679   PetscCall(PetscBLASIntCast(totalnz,&tnz));
2680   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2681   PetscCall(PetscLogFlops(totalnz));
2682   PetscFunctionReturn(0);
2683 }
2684 
2685 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2686 {
2687   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2688   MatScalar      *v  = a->a;
2689   PetscReal      sum = 0.0;
2690   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2691 
2692   PetscFunctionBegin;
2693   if (type == NORM_FROBENIUS) {
2694 #if defined(PETSC_USE_REAL___FP16)
2695     PetscBLASInt one = 1,cnt = bs2*nz;
2696     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2697 #else
2698     for (i=0; i<bs2*nz; i++) {
2699       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2700     }
2701 #endif
2702     *norm = PetscSqrtReal(sum);
2703     PetscCall(PetscLogFlops(2.0*bs2*nz));
2704   } else if (type == NORM_1) { /* maximum column sum */
2705     PetscReal *tmp;
2706     PetscInt  *bcol = a->j;
2707     PetscCall(PetscCalloc1(A->cmap->n+1,&tmp));
2708     for (i=0; i<nz; i++) {
2709       for (j=0; j<bs; j++) {
2710         k1 = bs*(*bcol) + j; /* column index */
2711         for (k=0; k<bs; k++) {
2712           tmp[k1] += PetscAbsScalar(*v); v++;
2713         }
2714       }
2715       bcol++;
2716     }
2717     *norm = 0.0;
2718     for (j=0; j<A->cmap->n; j++) {
2719       if (tmp[j] > *norm) *norm = tmp[j];
2720     }
2721     PetscCall(PetscFree(tmp));
2722     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2723   } else if (type == NORM_INFINITY) { /* maximum row sum */
2724     *norm = 0.0;
2725     for (k=0; k<bs; k++) {
2726       for (j=0; j<a->mbs; j++) {
2727         v   = a->a + bs2*a->i[j] + k;
2728         sum = 0.0;
2729         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2730           for (k1=0; k1<bs; k1++) {
2731             sum += PetscAbsScalar(*v);
2732             v   += bs;
2733           }
2734         }
2735         if (sum > *norm) *norm = sum;
2736       }
2737     }
2738     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2739   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2740   PetscFunctionReturn(0);
2741 }
2742 
2743 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2744 {
2745   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2746 
2747   PetscFunctionBegin;
2748   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2749   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2750     *flg = PETSC_FALSE;
2751     PetscFunctionReturn(0);
2752   }
2753 
2754   /* if the a->i are the same */
2755   PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg));
2756   if (!*flg) PetscFunctionReturn(0);
2757 
2758   /* if a->j are the same */
2759   PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg));
2760   if (!*flg) PetscFunctionReturn(0);
2761 
2762   /* if a->a are the same */
2763   PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg));
2764   PetscFunctionReturn(0);
2765 
2766 }
2767 
2768 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2769 {
2770   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2771   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2772   PetscScalar    *x,zero = 0.0;
2773   MatScalar      *aa,*aa_j;
2774 
2775   PetscFunctionBegin;
2776   PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2777   bs   = A->rmap->bs;
2778   aa   = a->a;
2779   ai   = a->i;
2780   aj   = a->j;
2781   ambs = a->mbs;
2782   bs2  = a->bs2;
2783 
2784   PetscCall(VecSet(v,zero));
2785   PetscCall(VecGetArray(v,&x));
2786   PetscCall(VecGetLocalSize(v,&n));
2787   PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2788   for (i=0; i<ambs; i++) {
2789     for (j=ai[i]; j<ai[i+1]; j++) {
2790       if (aj[j] == i) {
2791         row  = i*bs;
2792         aa_j = aa+j*bs2;
2793         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2794         break;
2795       }
2796     }
2797   }
2798   PetscCall(VecRestoreArray(v,&x));
2799   PetscFunctionReturn(0);
2800 }
2801 
2802 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2803 {
2804   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2805   const PetscScalar *l,*r,*li,*ri;
2806   PetscScalar       x;
2807   MatScalar         *aa, *v;
2808   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2809   const PetscInt    *ai,*aj;
2810 
2811   PetscFunctionBegin;
2812   ai  = a->i;
2813   aj  = a->j;
2814   aa  = a->a;
2815   m   = A->rmap->n;
2816   n   = A->cmap->n;
2817   bs  = A->rmap->bs;
2818   mbs = a->mbs;
2819   bs2 = a->bs2;
2820   if (ll) {
2821     PetscCall(VecGetArrayRead(ll,&l));
2822     PetscCall(VecGetLocalSize(ll,&lm));
2823     PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2824     for (i=0; i<mbs; i++) { /* for each block row */
2825       M  = ai[i+1] - ai[i];
2826       li = l + i*bs;
2827       v  = aa + bs2*ai[i];
2828       for (j=0; j<M; j++) { /* for each block */
2829         for (k=0; k<bs2; k++) {
2830           (*v++) *= li[k%bs];
2831         }
2832       }
2833     }
2834     PetscCall(VecRestoreArrayRead(ll,&l));
2835     PetscCall(PetscLogFlops(a->nz));
2836   }
2837 
2838   if (rr) {
2839     PetscCall(VecGetArrayRead(rr,&r));
2840     PetscCall(VecGetLocalSize(rr,&rn));
2841     PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2842     for (i=0; i<mbs; i++) { /* for each block row */
2843       iai = ai[i];
2844       M   = ai[i+1] - iai;
2845       v   = aa + bs2*iai;
2846       for (j=0; j<M; j++) { /* for each block */
2847         ri = r + bs*aj[iai+j];
2848         for (k=0; k<bs; k++) {
2849           x = ri[k];
2850           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2851           v += bs;
2852         }
2853       }
2854     }
2855     PetscCall(VecRestoreArrayRead(rr,&r));
2856     PetscCall(PetscLogFlops(a->nz));
2857   }
2858   PetscFunctionReturn(0);
2859 }
2860 
2861 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2862 {
2863   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2864 
2865   PetscFunctionBegin;
2866   info->block_size   = a->bs2;
2867   info->nz_allocated = a->bs2*a->maxnz;
2868   info->nz_used      = a->bs2*a->nz;
2869   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2870   info->assemblies   = A->num_ass;
2871   info->mallocs      = A->info.mallocs;
2872   info->memory       = ((PetscObject)A)->mem;
2873   if (A->factortype) {
2874     info->fill_ratio_given  = A->info.fill_ratio_given;
2875     info->fill_ratio_needed = A->info.fill_ratio_needed;
2876     info->factor_mallocs    = A->info.factor_mallocs;
2877   } else {
2878     info->fill_ratio_given  = 0;
2879     info->fill_ratio_needed = 0;
2880     info->factor_mallocs    = 0;
2881   }
2882   PetscFunctionReturn(0);
2883 }
2884 
2885 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2886 {
2887   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2888 
2889   PetscFunctionBegin;
2890   PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs]));
2891   PetscFunctionReturn(0);
2892 }
2893 
2894 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2895 {
2896   PetscFunctionBegin;
2897   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
2898   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2899   PetscFunctionReturn(0);
2900 }
2901 
2902 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2903 {
2904   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2905   PetscScalar       *z = NULL,sum1;
2906   const PetscScalar *xb;
2907   PetscScalar       x1;
2908   const MatScalar   *v,*vv;
2909   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2910   PetscBool         usecprow=a->compressedrow.use;
2911 
2912   PetscFunctionBegin;
2913   idx = a->j;
2914   v   = a->a;
2915   if (usecprow) {
2916     mbs  = a->compressedrow.nrows;
2917     ii   = a->compressedrow.i;
2918     ridx = a->compressedrow.rindex;
2919   } else {
2920     mbs = a->mbs;
2921     ii  = a->i;
2922     z   = c;
2923   }
2924 
2925   for (i=0; i<mbs; i++) {
2926     n           = ii[1] - ii[0]; ii++;
2927     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2928     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2929     if (usecprow) z = c + ridx[i];
2930     jj = idx;
2931     vv = v;
2932     for (k=0; k<cn; k++) {
2933       idx = jj;
2934       v = vv;
2935       sum1    = 0.0;
2936       for (j=0; j<n; j++) {
2937         xb    = b + (*idx++); x1 = xb[0+k*bm];
2938         sum1 += v[0]*x1;
2939         v    += 1;
2940       }
2941       z[0+k*cm] = sum1;
2942     }
2943     if (!usecprow) z += 1;
2944   }
2945   PetscFunctionReturn(0);
2946 }
2947 
2948 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2949 {
2950   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2951   PetscScalar       *z = NULL,sum1,sum2;
2952   const PetscScalar *xb;
2953   PetscScalar       x1,x2;
2954   const MatScalar   *v,*vv;
2955   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2956   PetscBool         usecprow=a->compressedrow.use;
2957 
2958   PetscFunctionBegin;
2959   idx = a->j;
2960   v   = a->a;
2961   if (usecprow) {
2962     mbs  = a->compressedrow.nrows;
2963     ii   = a->compressedrow.i;
2964     ridx = a->compressedrow.rindex;
2965   } else {
2966     mbs = a->mbs;
2967     ii  = a->i;
2968     z   = c;
2969   }
2970 
2971   for (i=0; i<mbs; i++) {
2972     n           = ii[1] - ii[0]; ii++;
2973     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2974     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2975     if (usecprow) z = c + 2*ridx[i];
2976     jj = idx;
2977     vv = v;
2978     for (k=0; k<cn; k++) {
2979       idx = jj;
2980       v = vv;
2981       sum1    = 0.0; sum2 = 0.0;
2982       for (j=0; j<n; j++) {
2983         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2984         sum1 += v[0]*x1 + v[2]*x2;
2985         sum2 += v[1]*x1 + v[3]*x2;
2986         v    += 4;
2987       }
2988       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2989     }
2990     if (!usecprow) z += 2;
2991   }
2992   PetscFunctionReturn(0);
2993 }
2994 
2995 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2996 {
2997   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2998   PetscScalar       *z = NULL,sum1,sum2,sum3;
2999   const PetscScalar *xb;
3000   PetscScalar       x1,x2,x3;
3001   const MatScalar   *v,*vv;
3002   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3003   PetscBool         usecprow=a->compressedrow.use;
3004 
3005   PetscFunctionBegin;
3006   idx = a->j;
3007   v   = a->a;
3008   if (usecprow) {
3009     mbs  = a->compressedrow.nrows;
3010     ii   = a->compressedrow.i;
3011     ridx = a->compressedrow.rindex;
3012   } else {
3013     mbs = a->mbs;
3014     ii  = a->i;
3015     z   = c;
3016   }
3017 
3018   for (i=0; i<mbs; i++) {
3019     n           = ii[1] - ii[0]; ii++;
3020     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3021     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3022     if (usecprow) z = c + 3*ridx[i];
3023     jj = idx;
3024     vv = v;
3025     for (k=0; k<cn; k++) {
3026       idx = jj;
3027       v = vv;
3028       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3029       for (j=0; j<n; j++) {
3030         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3031         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3032         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3033         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3034         v    += 9;
3035       }
3036       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3037     }
3038     if (!usecprow) z += 3;
3039   }
3040   PetscFunctionReturn(0);
3041 }
3042 
3043 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3044 {
3045   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3046   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3047   const PetscScalar *xb;
3048   PetscScalar       x1,x2,x3,x4;
3049   const MatScalar   *v,*vv;
3050   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3051   PetscBool         usecprow=a->compressedrow.use;
3052 
3053   PetscFunctionBegin;
3054   idx = a->j;
3055   v   = a->a;
3056   if (usecprow) {
3057     mbs  = a->compressedrow.nrows;
3058     ii   = a->compressedrow.i;
3059     ridx = a->compressedrow.rindex;
3060   } else {
3061     mbs = a->mbs;
3062     ii  = a->i;
3063     z   = c;
3064   }
3065 
3066   for (i=0; i<mbs; i++) {
3067     n           = ii[1] - ii[0]; ii++;
3068     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3069     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3070     if (usecprow) z = c + 4*ridx[i];
3071     jj = idx;
3072     vv = v;
3073     for (k=0; k<cn; k++) {
3074       idx = jj;
3075       v = vv;
3076       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3077       for (j=0; j<n; j++) {
3078         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3079         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3080         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3081         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3082         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3083         v    += 16;
3084       }
3085       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3086     }
3087     if (!usecprow) z += 4;
3088   }
3089   PetscFunctionReturn(0);
3090 }
3091 
3092 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3093 {
3094   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3095   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3096   const PetscScalar *xb;
3097   PetscScalar       x1,x2,x3,x4,x5;
3098   const MatScalar   *v,*vv;
3099   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3100   PetscBool         usecprow=a->compressedrow.use;
3101 
3102   PetscFunctionBegin;
3103   idx = a->j;
3104   v   = a->a;
3105   if (usecprow) {
3106     mbs  = a->compressedrow.nrows;
3107     ii   = a->compressedrow.i;
3108     ridx = a->compressedrow.rindex;
3109   } else {
3110     mbs = a->mbs;
3111     ii  = a->i;
3112     z   = c;
3113   }
3114 
3115   for (i=0; i<mbs; i++) {
3116     n           = ii[1] - ii[0]; ii++;
3117     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3118     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3119     if (usecprow) z = c + 5*ridx[i];
3120     jj = idx;
3121     vv = v;
3122     for (k=0; k<cn; k++) {
3123       idx = jj;
3124       v = vv;
3125       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3126       for (j=0; j<n; j++) {
3127         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3128         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3129         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3130         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3131         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3132         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3133         v    += 25;
3134       }
3135       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3136     }
3137     if (!usecprow) z += 5;
3138   }
3139   PetscFunctionReturn(0);
3140 }
3141 
3142 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3143 {
3144   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3145   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3146   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3147   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3148   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3149   PetscBLASInt      bbs,bcn,bbm,bcm;
3150   PetscScalar       *z = NULL;
3151   PetscScalar       *c,*b;
3152   const MatScalar   *v;
3153   const PetscInt    *idx,*ii,*ridx=NULL;
3154   PetscScalar       _DZero=0.0,_DOne=1.0;
3155   PetscBool         usecprow=a->compressedrow.use;
3156 
3157   PetscFunctionBegin;
3158   if (!cm || !cn) PetscFunctionReturn(0);
3159   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
3160   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
3161   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
3162   b = bd->v;
3163   if (a->nonzerorowcnt != A->rmap->n) {
3164     PetscCall(MatZeroEntries(C));
3165   }
3166   PetscCall(MatDenseGetArray(C,&c));
3167   switch (bs) {
3168   case 1:
3169     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3170     break;
3171   case 2:
3172     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3173     break;
3174   case 3:
3175     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3176     break;
3177   case 4:
3178     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3179     break;
3180   case 5:
3181     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3182     break;
3183   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3184     PetscCall(PetscBLASIntCast(bs,&bbs));
3185     PetscCall(PetscBLASIntCast(cn,&bcn));
3186     PetscCall(PetscBLASIntCast(bm,&bbm));
3187     PetscCall(PetscBLASIntCast(cm,&bcm));
3188     idx = a->j;
3189     v   = a->a;
3190     if (usecprow) {
3191       mbs  = a->compressedrow.nrows;
3192       ii   = a->compressedrow.i;
3193       ridx = a->compressedrow.rindex;
3194     } else {
3195       mbs = a->mbs;
3196       ii  = a->i;
3197       z   = c;
3198     }
3199     for (i=0; i<mbs; i++) {
3200       n = ii[1] - ii[0]; ii++;
3201       if (usecprow) z = c + bs*ridx[i];
3202       if (n) {
3203         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3204         v += bs2;
3205       }
3206       for (j=1; j<n; j++) {
3207         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3208         v += bs2;
3209       }
3210       if (!usecprow) z += bs;
3211     }
3212   }
3213   PetscCall(MatDenseRestoreArray(C,&c));
3214   PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn));
3215   PetscFunctionReturn(0);
3216 }
3217