xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision b94d7ded0a05f1bbd5e48daa6f92b28259c75b44)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1049b5e25fSSatish Balay {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
125d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT        table_out,table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
1708401ef6SPierre Jolivet   PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18d94109b8SHong Zhang   mbs  = a->mbs;
19d94109b8SHong Zhang   ai   = a->i;
20d94109b8SHong Zhang   aj   = a->j;
21d0f46423SBarry Smith   bs   = A->rmap->bs;
229566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs,&table_out));
239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs+1,&nidx));
249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2));
259566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs,&table_in));
26d94109b8SHong Zhang 
27d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
28d94109b8SHong Zhang     isz  = 0;
299566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs,table_out));
30d94109b8SHong Zhang 
31d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
329566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i],&idx));
339566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i],&n));
34d94109b8SHong Zhang 
35db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36dbe03f88SHong Zhang     bcol_max = 0;
37d94109b8SHong Zhang     for (j=0; j<n; ++j) {
38d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
3908401ef6SPierre Jolivet       PetscCheck(brow < mbs,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
41dbe03f88SHong Zhang         nidx[isz++] = brow;
42dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
43dbe03f88SHong Zhang       }
44d94109b8SHong Zhang     }
459566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i],&idx));
469566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
47d94109b8SHong Zhang 
48d94109b8SHong Zhang     k = 0;
49d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
50db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
519566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs,table_in));
529566063dSJacob Faibussowitsch       for (l=k; l<isz; l++) PetscCall(PetscBTSet(table_in,nidx[l]));
53d94109b8SHong Zhang 
54d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
55d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
56d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l<end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l<end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
79d94109b8SHong Zhang 
80d94109b8SHong Zhang     /* expand the Index Set */
81d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8226fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83d94109b8SHong Zhang     }
849566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i));
85d94109b8SHong Zhang   }
869566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
879566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
889566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx2));
899566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
905eee224dSHong Zhang   PetscFunctionReturn(0);
9149b5e25fSSatish Balay }
9249b5e25fSSatish Balay 
93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
95847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9649b5e25fSSatish Balay {
9749b5e25fSSatish Balay   PetscFunctionBegin;
989566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE));
99f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
100f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
101f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
102f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
103f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
104f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
105f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
106f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
107f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
10849b5e25fSSatish Balay   PetscFunctionReturn(0);
10949b5e25fSSatish Balay }
11049b5e25fSSatish Balay 
1117dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1127dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
113e50a8114SHong Zhang {
114e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
115e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
116e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
117e50a8114SHong Zhang   const PetscInt *irow,*icol;
118e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
119e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
120e50a8114SHong Zhang   MatScalar      *mat_a;
121e50a8114SHong Zhang   Mat            C;
122e50a8114SHong Zhang   PetscBool      flag;
123e50a8114SHong Zhang 
124e50a8114SHong Zhang   PetscFunctionBegin;
125e50a8114SHong Zhang 
1269566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&irow));
1279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&icol));
1289566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow,&nrows));
1299566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol,&ncols));
130e50a8114SHong Zhang 
1319566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1+oldcols,&smap));
132e50a8114SHong Zhang   ssmap = smap;
1339566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1+nrows,&lens));
134e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
135e50a8114SHong Zhang   /* determine lens of each row */
136e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
137e50a8114SHong Zhang     kstart  = ai[irow[i]];
138e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
139e50a8114SHong Zhang     lens[i] = 0;
140e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
141e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
142e50a8114SHong Zhang     }
143e50a8114SHong Zhang   }
144e50a8114SHong Zhang   /* Create and fill new matrix */
145e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
146e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
147e50a8114SHong Zhang 
148aed4548fSBarry Smith     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
1499566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag));
15028b400f6SJacob Faibussowitsch     PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1519566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(c->ilen,c->mbs));
152e50a8114SHong Zhang     C    = *B;
153e50a8114SHong Zhang   } else {
1549566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C));
1559566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE));
1569566063dSJacob Faibussowitsch     PetscCall(MatSetType(C,((PetscObject)A)->type_name));
1579566063dSJacob Faibussowitsch     PetscCall(MatSeqSBAIJSetPreallocation(C,bs,0,lens));
158e50a8114SHong Zhang   }
159e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
160e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
161e50a8114SHong Zhang     row      = irow[i];
162e50a8114SHong Zhang     kstart   = ai[row];
163e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
164e50a8114SHong Zhang     mat_i    = c->i[i];
165e50a8114SHong Zhang     mat_j    = c->j + mat_i;
166e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
167e50a8114SHong Zhang     mat_ilen = c->ilen + i;
168e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
169e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
170e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1719566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2));
172e50a8114SHong Zhang         mat_a   += bs2;
173e50a8114SHong Zhang         (*mat_ilen)++;
174e50a8114SHong Zhang       }
175e50a8114SHong Zhang     }
176e50a8114SHong Zhang   }
177e50a8114SHong Zhang   /* sort */
178e50a8114SHong Zhang   {
179e50a8114SHong Zhang     MatScalar *work;
180e50a8114SHong Zhang 
1819566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2,&work));
182e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
183e50a8114SHong Zhang       PetscInt ilen;
184e50a8114SHong Zhang       mat_i = c->i[i];
185e50a8114SHong Zhang       mat_j = c->j + mat_i;
186e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
187e50a8114SHong Zhang       ilen  = c->ilen[i];
1889566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
189e50a8114SHong Zhang     }
1909566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
191e50a8114SHong Zhang   }
192e50a8114SHong Zhang 
193e50a8114SHong Zhang   /* Free work space */
1949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&icol));
1959566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
1969566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
1979566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1989566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
199e50a8114SHong Zhang 
2009566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&irow));
201e50a8114SHong Zhang   *B   = C;
202e50a8114SHong Zhang   PetscFunctionReturn(0);
203e50a8114SHong Zhang }
204e50a8114SHong Zhang 
2057dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
20649b5e25fSSatish Balay {
207e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
208e50a8114SHong Zhang   IS             is1,is2;
209e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
210e50a8114SHong Zhang   const PetscInt *irow,*icol;
21149b5e25fSSatish Balay 
21249b5e25fSSatish Balay   PetscFunctionBegin;
2139566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&irow));
2149566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&icol));
2159566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow,&nrows));
2169566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol,&ncols));
217e50a8114SHong Zhang 
218e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
219e50a8114SHong Zhang    and form the IS with compressed IS */
220e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
2219566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary));
2229566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary,a->mbs));
223e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
224e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
225aed4548fSBarry Smith     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
226e50a8114SHong Zhang   }
227e50a8114SHong Zhang   count = 0;
228e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
229e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
230e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
231e50a8114SHong Zhang   }
2329566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1));
233e50a8114SHong Zhang 
2349566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(vary,a->nbs));
235e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
236e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
237aed4548fSBarry Smith     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
238e50a8114SHong Zhang   }
239e50a8114SHong Zhang   count = 0;
240e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
241e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
242e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
243e50a8114SHong Zhang   }
2449566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2));
2459566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&irow));
2469566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&icol));
2479566063dSJacob Faibussowitsch   PetscCall(PetscFree2(vary,iary));
248e50a8114SHong Zhang 
2499566063dSJacob Faibussowitsch   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B));
2509566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2519566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
252847daeccSHong Zhang 
2538f46ffcaSHong Zhang   if (isrow != iscol) {
2548f46ffcaSHong Zhang     PetscBool isequal;
2559566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow,iscol,&isequal));
256847daeccSHong Zhang     if (!isequal) {
2579566063dSJacob Faibussowitsch       PetscCall(MatSeqSBAIJZeroOps_Private(*B));
2588f46ffcaSHong Zhang     }
25949b5e25fSSatish Balay   }
26049b5e25fSSatish Balay   PetscFunctionReturn(0);
26149b5e25fSSatish Balay }
26249b5e25fSSatish Balay 
2637dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26449b5e25fSSatish Balay {
26513f74950SBarry Smith   PetscInt       i;
26649b5e25fSSatish Balay 
26749b5e25fSSatish Balay   PetscFunctionBegin;
268e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
2699566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(n+1,B));
270afebec48SHong Zhang   }
271e50a8114SHong Zhang 
272e50a8114SHong Zhang   for (i=0; i<n; i++) {
2739566063dSJacob Faibussowitsch     PetscCall(MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]));
27449b5e25fSSatish Balay   }
27549b5e25fSSatish Balay   PetscFunctionReturn(0);
27649b5e25fSSatish Balay }
27749b5e25fSSatish Balay 
27849b5e25fSSatish Balay /* -------------------------------------------------------*/
27949b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28049b5e25fSSatish Balay /* -------------------------------------------------------*/
28149b5e25fSSatish Balay 
282dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28349b5e25fSSatish Balay {
28449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
285d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
286d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
287d9ca1df4SBarry Smith   const MatScalar   *v;
288d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
289d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29149b5e25fSSatish Balay 
29249b5e25fSSatish Balay   PetscFunctionBegin;
2939566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
294c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
2959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
2969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
29749b5e25fSSatish Balay 
29849b5e25fSSatish Balay   v  = a->a;
29949b5e25fSSatish Balay   xb = x;
30049b5e25fSSatish Balay 
30149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
30349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
30449b5e25fSSatish Balay     ib          = aj + *ai;
305831a3094SHong Zhang     jmin        = 0;
30698c9bda7SSatish Balay     nonzerorow += (n>0);
3077fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
30849b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
30949b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
310831a3094SHong Zhang       v        += 4; jmin++;
3117fbae186SHong Zhang     }
312444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
313444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
314831a3094SHong Zhang     for (j=jmin; j<n; j++) {
31549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
31649b5e25fSSatish Balay       cval       = ib[j]*2;
31749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
31849b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
31949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32049b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32149b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32249b5e25fSSatish Balay       v        += 4;
32349b5e25fSSatish Balay     }
32449b5e25fSSatish Balay     xb +=2; ai++;
32549b5e25fSSatish Balay   }
32649b5e25fSSatish Balay 
3279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
3289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
3299566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
33049b5e25fSSatish Balay   PetscFunctionReturn(0);
33149b5e25fSSatish Balay }
33249b5e25fSSatish Balay 
333dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
33449b5e25fSSatish Balay {
33549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
336d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
337d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
338d9ca1df4SBarry Smith   const MatScalar   *v;
339d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
340d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
34249b5e25fSSatish Balay 
34349b5e25fSSatish Balay   PetscFunctionBegin;
3449566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
345c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
3469566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
3479566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
34849b5e25fSSatish Balay 
34949b5e25fSSatish Balay   v  = a->a;
35049b5e25fSSatish Balay   xb = x;
35149b5e25fSSatish Balay 
35249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
35349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
35449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
35549b5e25fSSatish Balay     ib          = aj + *ai;
356831a3094SHong Zhang     jmin        = 0;
35798c9bda7SSatish Balay     nonzerorow += (n>0);
3587fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
35949b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36049b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36149b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
362831a3094SHong Zhang       v        += 9; jmin++;
3637fbae186SHong Zhang     }
364444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
365444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
366831a3094SHong Zhang     for (j=jmin; j<n; j++) {
36749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
36849b5e25fSSatish Balay       cval       = ib[j]*3;
36949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37049b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37149b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
37249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
37349b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
37449b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
37549b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
37649b5e25fSSatish Balay       v        += 9;
37749b5e25fSSatish Balay     }
37849b5e25fSSatish Balay     xb +=3; ai++;
37949b5e25fSSatish Balay   }
38049b5e25fSSatish Balay 
3819566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
3829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
3839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
38449b5e25fSSatish Balay   PetscFunctionReturn(0);
38549b5e25fSSatish Balay }
38649b5e25fSSatish Balay 
387dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
38849b5e25fSSatish Balay {
38949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
390d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
391d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
392d9ca1df4SBarry Smith   const MatScalar   *v;
393d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
394d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
39598c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
39649b5e25fSSatish Balay 
39749b5e25fSSatish Balay   PetscFunctionBegin;
3989566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
399c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
4009566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
4019566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
40249b5e25fSSatish Balay 
40349b5e25fSSatish Balay   v  = a->a;
40449b5e25fSSatish Balay   xb = x;
40549b5e25fSSatish Balay 
40649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
40749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
40849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
40949b5e25fSSatish Balay     ib          = aj + *ai;
410831a3094SHong Zhang     jmin        = 0;
41198c9bda7SSatish Balay     nonzerorow += (n>0);
4127fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
41349b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
41449b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
41549b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
41649b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
417831a3094SHong Zhang       v        += 16; jmin++;
4187fbae186SHong Zhang     }
419444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
420444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
421831a3094SHong Zhang     for (j=jmin; j<n; j++) {
42249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
42349b5e25fSSatish Balay       cval       = ib[j]*4;
42449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
42549b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
42649b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
42749b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
42849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
42949b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43049b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
43149b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
43249b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
43349b5e25fSSatish Balay       v        += 16;
43449b5e25fSSatish Balay     }
43549b5e25fSSatish Balay     xb +=4; ai++;
43649b5e25fSSatish Balay   }
43749b5e25fSSatish Balay 
4389566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
4399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
4409566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
44149b5e25fSSatish Balay   PetscFunctionReturn(0);
44249b5e25fSSatish Balay }
44349b5e25fSSatish Balay 
444dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
44549b5e25fSSatish Balay {
44649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
447d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
448d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
449d9ca1df4SBarry Smith   const MatScalar   *v;
450d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
451d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
45298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
45349b5e25fSSatish Balay 
45449b5e25fSSatish Balay   PetscFunctionBegin;
4559566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
456c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
4579566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
4589566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
45949b5e25fSSatish Balay 
46049b5e25fSSatish Balay   v  = a->a;
46149b5e25fSSatish Balay   xb = x;
46249b5e25fSSatish Balay 
46349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
46449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
46549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
46649b5e25fSSatish Balay     ib          = aj + *ai;
467831a3094SHong Zhang     jmin        = 0;
46898c9bda7SSatish Balay     nonzerorow += (n>0);
4697fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
47049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
47149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
47249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
47349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
47449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
475831a3094SHong Zhang       v        += 25; jmin++;
4767fbae186SHong Zhang     }
477444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
478444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479831a3094SHong Zhang     for (j=jmin; j<n; j++) {
48049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
48149b5e25fSSatish Balay       cval       = ib[j]*5;
48249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
48349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
48449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
48549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
48649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
48749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
48849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
48949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
49049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
49149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
49249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
49349b5e25fSSatish Balay       v        += 25;
49449b5e25fSSatish Balay     }
49549b5e25fSSatish Balay     xb +=5; ai++;
49649b5e25fSSatish Balay   }
49749b5e25fSSatish Balay 
4989566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
4999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
5009566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
50149b5e25fSSatish Balay   PetscFunctionReturn(0);
50249b5e25fSSatish Balay }
50349b5e25fSSatish Balay 
504dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
50549b5e25fSSatish Balay {
50649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
507d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
508d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
509d9ca1df4SBarry Smith   const MatScalar   *v;
510d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
511d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
51298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
51349b5e25fSSatish Balay 
51449b5e25fSSatish Balay   PetscFunctionBegin;
5159566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
516c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5179566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
5189566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
51949b5e25fSSatish Balay 
52049b5e25fSSatish Balay   v  = a->a;
52149b5e25fSSatish Balay   xb = x;
52249b5e25fSSatish Balay 
52349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
52449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
52549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
52649b5e25fSSatish Balay     ib          = aj + *ai;
527831a3094SHong Zhang     jmin        = 0;
52898c9bda7SSatish Balay     nonzerorow += (n>0);
5297fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
53049b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
53149b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
53249b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
53349b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
53449b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
53549b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
536831a3094SHong Zhang       v        += 36; jmin++;
5377fbae186SHong Zhang     }
538444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
539444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
540831a3094SHong Zhang     for (j=jmin; j<n; j++) {
54149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
54249b5e25fSSatish Balay       cval       = ib[j]*6;
54349b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
54449b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
54549b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
54649b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
54749b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
54849b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
54949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
55049b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
55149b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
55249b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
55349b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
55449b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
55549b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
55649b5e25fSSatish Balay       v        += 36;
55749b5e25fSSatish Balay     }
55849b5e25fSSatish Balay     xb +=6; ai++;
55949b5e25fSSatish Balay   }
56049b5e25fSSatish Balay 
5619566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
5629566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
5639566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
56449b5e25fSSatish Balay   PetscFunctionReturn(0);
56549b5e25fSSatish Balay }
566c2916339SPierre Jolivet 
567dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
56849b5e25fSSatish Balay {
56949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
570d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
571d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
572d9ca1df4SBarry Smith   const MatScalar   *v;
573d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
574d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
57598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
57649b5e25fSSatish Balay 
57749b5e25fSSatish Balay   PetscFunctionBegin;
5789566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
579c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
5809566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
5819566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
58249b5e25fSSatish Balay 
58349b5e25fSSatish Balay   v  = a->a;
58449b5e25fSSatish Balay   xb = x;
58549b5e25fSSatish Balay 
58649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
58749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
58849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
58949b5e25fSSatish Balay     ib          = aj + *ai;
590831a3094SHong Zhang     jmin        = 0;
59198c9bda7SSatish Balay     nonzerorow += (n>0);
5927fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
59349b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
59449b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
59549b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
59649b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
59749b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
59849b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
59949b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
600831a3094SHong Zhang       v        += 49; jmin++;
6017fbae186SHong Zhang     }
602444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
603444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
604831a3094SHong Zhang     for (j=jmin; j<n; j++) {
60549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
60649b5e25fSSatish Balay       cval       = ib[j]*7;
60749b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
60849b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
60949b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
61049b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
61149b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
61249b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
61349b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
61449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
61549b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
61649b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
61749b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
61849b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
61949b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
62049b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
62149b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
62249b5e25fSSatish Balay       v       += 49;
62349b5e25fSSatish Balay     }
62449b5e25fSSatish Balay     xb +=7; ai++;
62549b5e25fSSatish Balay   }
6269566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
6279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
6289566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow));
62949b5e25fSSatish Balay   PetscFunctionReturn(0);
63049b5e25fSSatish Balay }
63149b5e25fSSatish Balay 
63249b5e25fSSatish Balay /*
63349b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
63449b5e25fSSatish Balay */
635dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
63649b5e25fSSatish Balay {
63749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
638d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
639d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
640d9ca1df4SBarry Smith   const MatScalar   *v;
641d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
642d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
64398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
64449b5e25fSSatish Balay 
64549b5e25fSSatish Balay   PetscFunctionBegin;
6469566063dSJacob Faibussowitsch   PetscCall(VecSet(zz,zero));
647c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
6489566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
6499566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
65059689ffbSStefano Zampini 
65159689ffbSStefano Zampini   x_ptr = x;
65259689ffbSStefano Zampini   z_ptr = z;
65349b5e25fSSatish Balay 
65449b5e25fSSatish Balay   aj = a->j;
65549b5e25fSSatish Balay   v  = a->a;
65649b5e25fSSatish Balay   ii = a->i;
65749b5e25fSSatish Balay 
65849b5e25fSSatish Balay   if (!a->mult_work) {
6599566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(A->rmap->N+1,&a->mult_work));
66049b5e25fSSatish Balay   }
66149b5e25fSSatish Balay   work = a->mult_work;
66249b5e25fSSatish Balay 
66349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
66449b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
66549b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
66698c9bda7SSatish Balay     nonzerorow += (n>0);
66749b5e25fSSatish Balay 
66849b5e25fSSatish Balay     /* upper triangular part */
66949b5e25fSSatish Balay     for (j=0; j<n; j++) {
67049b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
67149b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
67249b5e25fSSatish Balay       workt += bs;
67349b5e25fSSatish Balay     }
67449b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
67596b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
67649b5e25fSSatish Balay 
67749b5e25fSSatish Balay     /* strict lower triangular part */
678831a3094SHong Zhang     idx = aj+ii[0];
67959689ffbSStefano Zampini     if (n && *idx == i) {
68096b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
681831a3094SHong Zhang     }
68296b9376eSHong Zhang 
68349b5e25fSSatish Balay     if (ncols > 0) {
68449b5e25fSSatish Balay       workt = work;
6859566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt,ncols));
68696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
687831a3094SHong Zhang       for (j=0; j<n; j++) {
688831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
68949b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
69049b5e25fSSatish Balay         workt += bs;
69149b5e25fSSatish Balay       }
69249b5e25fSSatish Balay     }
69349b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
69449b5e25fSSatish Balay   }
69549b5e25fSSatish Balay 
6969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
6979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
6989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow));
69949b5e25fSSatish Balay   PetscFunctionReturn(0);
70049b5e25fSSatish Balay }
70149b5e25fSSatish Balay 
702dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
70349b5e25fSSatish Balay {
70449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
705d9ca1df4SBarry Smith   PetscScalar       *z,x1;
706d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
707d9ca1df4SBarry Smith   const MatScalar   *v;
708d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
709d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
71098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
711eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
712*b94d7dedSBarry Smith   const int         aconj = A->hermitian == PETSC_BOOL3_TRUE;
713eb1ec7c1SStefano Zampini #else
714eb1ec7c1SStefano Zampini   const int         aconj = 0;
715eb1ec7c1SStefano Zampini #endif
71649b5e25fSSatish Balay 
71749b5e25fSSatish Balay   PetscFunctionBegin;
7189566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
719c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
7209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
7219566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
72249b5e25fSSatish Balay   v    = a->a;
72349b5e25fSSatish Balay   xb   = x;
72449b5e25fSSatish Balay 
72549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
72649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
72749b5e25fSSatish Balay     x1          = xb[0];
72849b5e25fSSatish Balay     ib          = aj + *ai;
729831a3094SHong Zhang     jmin        = 0;
73098c9bda7SSatish Balay     nonzerorow += (n>0);
7313d9ade75SStefano Zampini     if (n && *ib == i) {            /* (diag of A)*x */
732831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
733831a3094SHong Zhang     }
734eb1ec7c1SStefano Zampini     if (aconj) {
735eb1ec7c1SStefano Zampini       for (j=jmin; j<n; j++) {
736eb1ec7c1SStefano Zampini         cval    = *ib;
737eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
738eb1ec7c1SStefano Zampini         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
739eb1ec7c1SStefano Zampini       }
740eb1ec7c1SStefano Zampini     } else {
741831a3094SHong Zhang       for (j=jmin; j<n; j++) {
74249b5e25fSSatish Balay         cval    = *ib;
74349b5e25fSSatish Balay         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
74449b5e25fSSatish Balay         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
74549b5e25fSSatish Balay       }
746eb1ec7c1SStefano Zampini     }
74749b5e25fSSatish Balay     xb++; ai++;
74849b5e25fSSatish Balay   }
74949b5e25fSSatish Balay 
7509566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
7519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
75249b5e25fSSatish Balay 
7539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)));
75449b5e25fSSatish Balay   PetscFunctionReturn(0);
75549b5e25fSSatish Balay }
75649b5e25fSSatish Balay 
757dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
75849b5e25fSSatish Balay {
75949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
760d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
761d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
762d9ca1df4SBarry Smith   const MatScalar   *v;
763d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
764d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
76598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
76649b5e25fSSatish Balay 
76749b5e25fSSatish Balay   PetscFunctionBegin;
7689566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
769c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
7709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
7719566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
77249b5e25fSSatish Balay 
77349b5e25fSSatish Balay   v  = a->a;
77449b5e25fSSatish Balay   xb = x;
77549b5e25fSSatish Balay 
77649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
77749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
77849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
77949b5e25fSSatish Balay     ib          = aj + *ai;
780831a3094SHong Zhang     jmin        = 0;
78198c9bda7SSatish Balay     nonzerorow += (n>0);
78259689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
78349b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
78449b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
785831a3094SHong Zhang       v        += 4; jmin++;
7867fbae186SHong Zhang     }
787444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
788444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
789831a3094SHong Zhang     for (j=jmin; j<n; j++) {
79049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
79149b5e25fSSatish Balay       cval       = ib[j]*2;
79249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
79349b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
79449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
79549b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
79649b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
79749b5e25fSSatish Balay       v        += 4;
79849b5e25fSSatish Balay     }
79949b5e25fSSatish Balay     xb +=2; ai++;
80049b5e25fSSatish Balay   }
8019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
8029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
80349b5e25fSSatish Balay 
8049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow)));
80549b5e25fSSatish Balay   PetscFunctionReturn(0);
80649b5e25fSSatish Balay }
80749b5e25fSSatish Balay 
808dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
80949b5e25fSSatish Balay {
81049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
811d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
812d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
813d9ca1df4SBarry Smith   const MatScalar   *v;
814d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
815d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
81698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
81749b5e25fSSatish Balay 
81849b5e25fSSatish Balay   PetscFunctionBegin;
8199566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
820c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
8219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
8229566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
82349b5e25fSSatish Balay 
82449b5e25fSSatish Balay   v  = a->a;
82549b5e25fSSatish Balay   xb = x;
82649b5e25fSSatish Balay 
82749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
82849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
82949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
83049b5e25fSSatish Balay     ib          = aj + *ai;
831831a3094SHong Zhang     jmin        = 0;
83298c9bda7SSatish Balay     nonzerorow += (n>0);
83359689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
83449b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
83549b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
83649b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
837831a3094SHong Zhang       v        += 9; jmin++;
8387fbae186SHong Zhang     }
839444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
841831a3094SHong Zhang     for (j=jmin; j<n; j++) {
84249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
84349b5e25fSSatish Balay       cval       = ib[j]*3;
84449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
84549b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
84649b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
84749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
84849b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
84949b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
85049b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
85149b5e25fSSatish Balay       v        += 9;
85249b5e25fSSatish Balay     }
85349b5e25fSSatish Balay     xb +=3; ai++;
85449b5e25fSSatish Balay   }
85549b5e25fSSatish Balay 
8569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
8579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
85849b5e25fSSatish Balay 
8599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow)));
86049b5e25fSSatish Balay   PetscFunctionReturn(0);
86149b5e25fSSatish Balay }
86249b5e25fSSatish Balay 
863dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
86449b5e25fSSatish Balay {
86549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
866d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
867d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
868d9ca1df4SBarry Smith   const MatScalar   *v;
869d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
870d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
87198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
87249b5e25fSSatish Balay 
87349b5e25fSSatish Balay   PetscFunctionBegin;
8749566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
875c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
8769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
8779566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
87849b5e25fSSatish Balay 
87949b5e25fSSatish Balay   v  = a->a;
88049b5e25fSSatish Balay   xb = x;
88149b5e25fSSatish Balay 
88249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
88349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
88449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
88549b5e25fSSatish Balay     ib          = aj + *ai;
886831a3094SHong Zhang     jmin        = 0;
88798c9bda7SSatish Balay     nonzerorow += (n>0);
88859689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
88949b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
89049b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
89149b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
89249b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893831a3094SHong Zhang       v        += 16; jmin++;
8947fbae186SHong Zhang     }
895444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897831a3094SHong Zhang     for (j=jmin; j<n; j++) {
89849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
89949b5e25fSSatish Balay       cval       = ib[j]*4;
90049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
90149b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
90249b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
90349b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
90449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
90549b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
90649b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
90749b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
90849b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
90949b5e25fSSatish Balay       v        += 16;
91049b5e25fSSatish Balay     }
91149b5e25fSSatish Balay     xb +=4; ai++;
91249b5e25fSSatish Balay   }
91349b5e25fSSatish Balay 
9149566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
9159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
91649b5e25fSSatish Balay 
9179566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow)));
91849b5e25fSSatish Balay   PetscFunctionReturn(0);
91949b5e25fSSatish Balay }
92049b5e25fSSatish Balay 
921dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
92249b5e25fSSatish Balay {
92349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
924d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
925d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
926d9ca1df4SBarry Smith   const MatScalar   *v;
927d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
928d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
92998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
93049b5e25fSSatish Balay 
93149b5e25fSSatish Balay   PetscFunctionBegin;
9329566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
933c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9349566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
9359566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
93649b5e25fSSatish Balay 
93749b5e25fSSatish Balay   v  = a->a;
93849b5e25fSSatish Balay   xb = x;
93949b5e25fSSatish Balay 
94049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
94149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
94249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
94349b5e25fSSatish Balay     ib          = aj + *ai;
944831a3094SHong Zhang     jmin        = 0;
94598c9bda7SSatish Balay     nonzerorow += (n>0);
94659689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
94749b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
94849b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
94949b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
95049b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
95149b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
952831a3094SHong Zhang       v        += 25; jmin++;
9537fbae186SHong Zhang     }
954444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
955444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956831a3094SHong Zhang     for (j=jmin; j<n; j++) {
95749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
95849b5e25fSSatish Balay       cval       = ib[j]*5;
95949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
96049b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
96149b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
96249b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
96349b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
96449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
96549b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
96649b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
96749b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
96849b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
96949b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
97049b5e25fSSatish Balay       v        += 25;
97149b5e25fSSatish Balay     }
97249b5e25fSSatish Balay     xb +=5; ai++;
97349b5e25fSSatish Balay   }
97449b5e25fSSatish Balay 
9759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
9769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
97749b5e25fSSatish Balay 
9789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow)));
97949b5e25fSSatish Balay   PetscFunctionReturn(0);
98049b5e25fSSatish Balay }
981c2916339SPierre Jolivet 
982dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
98349b5e25fSSatish Balay {
98449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
985d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
986d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
987d9ca1df4SBarry Smith   const MatScalar   *v;
988d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
989d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
99098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
99149b5e25fSSatish Balay 
99249b5e25fSSatish Balay   PetscFunctionBegin;
9939566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
994c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
9959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
9969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
99749b5e25fSSatish Balay 
99849b5e25fSSatish Balay   v  = a->a;
99949b5e25fSSatish Balay   xb = x;
100049b5e25fSSatish Balay 
100149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
100249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
100349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
100449b5e25fSSatish Balay     ib          = aj + *ai;
1005831a3094SHong Zhang     jmin        = 0;
100698c9bda7SSatish Balay     nonzerorow += (n>0);
100759689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
100849b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
100949b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
101049b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
101149b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
101249b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
101349b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1014831a3094SHong Zhang       v        += 36; jmin++;
10157fbae186SHong Zhang     }
1016444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1017444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1018831a3094SHong Zhang     for (j=jmin; j<n; j++) {
101949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
102049b5e25fSSatish Balay       cval       = ib[j]*6;
102149b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
102249b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
102349b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
102449b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
102549b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
102649b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
102749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
102849b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
102949b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
103049b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
103149b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
103249b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
103349b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
103449b5e25fSSatish Balay       v        += 36;
103549b5e25fSSatish Balay     }
103649b5e25fSSatish Balay     xb +=6; ai++;
103749b5e25fSSatish Balay   }
103849b5e25fSSatish Balay 
10399566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
10409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
104149b5e25fSSatish Balay 
10429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow)));
104349b5e25fSSatish Balay   PetscFunctionReturn(0);
104449b5e25fSSatish Balay }
104549b5e25fSSatish Balay 
1046dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
104749b5e25fSSatish Balay {
104849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1049d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1050d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1051d9ca1df4SBarry Smith   const MatScalar   *v;
1052d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1053d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
105498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
105549b5e25fSSatish Balay 
105649b5e25fSSatish Balay   PetscFunctionBegin;
10579566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
1058c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
10599566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
10609566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z));
106149b5e25fSSatish Balay 
106249b5e25fSSatish Balay   v  = a->a;
106349b5e25fSSatish Balay   xb = x;
106449b5e25fSSatish Balay 
106549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
106649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
106749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
106849b5e25fSSatish Balay     ib          = aj + *ai;
1069831a3094SHong Zhang     jmin        = 0;
107098c9bda7SSatish Balay     nonzerorow += (n>0);
107159689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
107249b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
107349b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
107449b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
107549b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
107649b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
107749b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
107849b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1079831a3094SHong Zhang       v        += 49; jmin++;
10807fbae186SHong Zhang     }
1081444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1082444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1083831a3094SHong Zhang     for (j=jmin; j<n; j++) {
108449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
108549b5e25fSSatish Balay       cval       = ib[j]*7;
108649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
108749b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
108849b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
108949b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
109049b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
109149b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
109249b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
109349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
109449b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
109549b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
109649b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
109749b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
109849b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
109949b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
110049b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
110149b5e25fSSatish Balay       v       += 49;
110249b5e25fSSatish Balay     }
110349b5e25fSSatish Balay     xb +=7; ai++;
110449b5e25fSSatish Balay   }
110549b5e25fSSatish Balay 
11069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
11079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
110849b5e25fSSatish Balay 
11099566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow)));
111049b5e25fSSatish Balay   PetscFunctionReturn(0);
111149b5e25fSSatish Balay }
111249b5e25fSSatish Balay 
1113dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
111449b5e25fSSatish Balay {
111549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1116f4259b30SLisandro Dalcin   PetscScalar       *z,*z_ptr=NULL,*zb,*work,*workt;
1117d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1118d9ca1df4SBarry Smith   const MatScalar   *v;
1119d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1120d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
112198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
112249b5e25fSSatish Balay 
112349b5e25fSSatish Balay   PetscFunctionBegin;
11249566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy,zz));
1125c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
11269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x)); x_ptr=x;
11279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz,&z)); z_ptr=z;
112849b5e25fSSatish Balay 
112949b5e25fSSatish Balay   aj = a->j;
113049b5e25fSSatish Balay   v  = a->a;
113149b5e25fSSatish Balay   ii = a->i;
113249b5e25fSSatish Balay 
113349b5e25fSSatish Balay   if (!a->mult_work) {
11349566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(A->rmap->n+1,&a->mult_work));
113549b5e25fSSatish Balay   }
113649b5e25fSSatish Balay   work = a->mult_work;
113749b5e25fSSatish Balay 
113849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
113949b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
114049b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
114198c9bda7SSatish Balay     nonzerorow += (n>0);
114249b5e25fSSatish Balay 
114349b5e25fSSatish Balay     /* upper triangular part */
114449b5e25fSSatish Balay     for (j=0; j<n; j++) {
114549b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
114649b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
114749b5e25fSSatish Balay       workt += bs;
114849b5e25fSSatish Balay     }
114949b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
115096b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
115149b5e25fSSatish Balay 
115249b5e25fSSatish Balay     /* strict lower triangular part */
1153831a3094SHong Zhang     idx = aj+ii[0];
115459689ffbSStefano Zampini     if (n && *idx == i) {
115596b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1156831a3094SHong Zhang     }
115749b5e25fSSatish Balay     if (ncols > 0) {
115849b5e25fSSatish Balay       workt = work;
11599566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt,ncols));
116096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1161831a3094SHong Zhang       for (j=0; j<n; j++) {
1162831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
116349b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
116449b5e25fSSatish Balay         workt += bs;
116549b5e25fSSatish Balay       }
116649b5e25fSSatish Balay     }
116749b5e25fSSatish Balay 
116849b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
116949b5e25fSSatish Balay   }
117049b5e25fSSatish Balay 
11719566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
11729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz,&z));
117349b5e25fSSatish Balay 
11749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)));
117549b5e25fSSatish Balay   PetscFunctionReturn(0);
117649b5e25fSSatish Balay }
117749b5e25fSSatish Balay 
1178f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
117949b5e25fSSatish Balay {
118049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1181f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1182c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
118349b5e25fSSatish Balay 
118449b5e25fSSatish Balay   PetscFunctionBegin;
11859566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2*a->nz,&totalnz));
11868b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
11879566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
118849b5e25fSSatish Balay   PetscFunctionReturn(0);
118949b5e25fSSatish Balay }
119049b5e25fSSatish Balay 
1191dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
119249b5e25fSSatish Balay {
119349b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1194d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
119549b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1196d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1197d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
119849b5e25fSSatish Balay 
119949b5e25fSSatish Balay   PetscFunctionBegin;
1200c40ae873SPierre Jolivet   if (!a->nz) {
1201c40ae873SPierre Jolivet     *norm = 0.0;
1202c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1203c40ae873SPierre Jolivet   }
120449b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
120549b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
120659689ffbSStefano Zampini       jmin = a->i[k];
120759689ffbSStefano Zampini       jmax = a->i[k+1];
1208831a3094SHong Zhang       col  = aj + jmin;
120959689ffbSStefano Zampini       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
121049b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
121149b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
121249b5e25fSSatish Balay         }
1213831a3094SHong Zhang         jmin++;
1214831a3094SHong Zhang       }
1215831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
121649b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
121749b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
121849b5e25fSSatish Balay         }
121949b5e25fSSatish Balay       }
122049b5e25fSSatish Balay     }
12218f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
12229566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0*bs2*a->nz));
12230b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
12249566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl));
12250b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12260b8dc8d2SHong Zhang     il[0] = 0;
122749b5e25fSSatish Balay 
122849b5e25fSSatish Balay     *norm = 0.0;
122949b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
123049b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
123149b5e25fSSatish Balay       /*-- col sum --*/
123249b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1233831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1234831a3094SHong Zhang                   at step k */
123549b5e25fSSatish Balay       while (i<mbs) {
123649b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
123749b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
123849b5e25fSSatish Balay         for (j=0; j<bs; j++) {
123949b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
124049b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
124149b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
124249b5e25fSSatish Balay           }
124349b5e25fSSatish Balay         }
124449b5e25fSSatish Balay         /* update il, jl */
1245831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1246831a3094SHong Zhang         jmax = a->i[i+1];
124749b5e25fSSatish Balay         if (jmin < jmax) {
124849b5e25fSSatish Balay           il[i] = jmin;
124949b5e25fSSatish Balay           j     = a->j[jmin];
125049b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
125149b5e25fSSatish Balay         }
125249b5e25fSSatish Balay         i = nexti;
125349b5e25fSSatish Balay       }
125449b5e25fSSatish Balay       /*-- row sum --*/
125559689ffbSStefano Zampini       jmin = a->i[k];
125659689ffbSStefano Zampini       jmax = a->i[k+1];
125749b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
125849b5e25fSSatish Balay         for (j=0; j<bs; j++) {
125949b5e25fSSatish Balay           v = a->a + i*bs2 + j;
126049b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12610b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
126249b5e25fSSatish Balay           }
126349b5e25fSSatish Balay         }
126449b5e25fSSatish Balay       }
126549b5e25fSSatish Balay       /* add k_th block row to il, jl */
1266831a3094SHong Zhang       col = aj+jmin;
126759689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
126849b5e25fSSatish Balay       if (jmin < jmax) {
126949b5e25fSSatish Balay         il[k] = jmin;
12700b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
127149b5e25fSSatish Balay       }
127249b5e25fSSatish Balay       for (j=0; j<bs; j++) {
127349b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
127449b5e25fSSatish Balay       }
127549b5e25fSSatish Balay     }
12769566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum,il,jl));
12779566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs*a->nz-1,0)));
1278f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
127949b5e25fSSatish Balay   PetscFunctionReturn(0);
128049b5e25fSSatish Balay }
128149b5e25fSSatish Balay 
1282ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
128349b5e25fSSatish Balay {
128449b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
128549b5e25fSSatish Balay 
128649b5e25fSSatish Balay   PetscFunctionBegin;
128749b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1288d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1289ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1290ef511fbeSHong Zhang     PetscFunctionReturn(0);
129149b5e25fSSatish Balay   }
129249b5e25fSSatish Balay 
129349b5e25fSSatish Balay   /* if the a->i are the same */
12949566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg));
129526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
129649b5e25fSSatish Balay 
129749b5e25fSSatish Balay   /* if a->j are the same */
12989566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg));
129926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
130026fbe8dcSKarl Rupp 
130149b5e25fSSatish Balay   /* if a->a are the same */
13029566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg));
1303935af2e7SHong Zhang   PetscFunctionReturn(0);
130449b5e25fSSatish Balay }
130549b5e25fSSatish Balay 
1306dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
130749b5e25fSSatish Balay {
130849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1309d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1310d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
131187828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1312d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
131349b5e25fSSatish Balay 
131449b5e25fSSatish Balay   PetscFunctionBegin;
1315d0f46423SBarry Smith   bs = A->rmap->bs;
1316aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
131782799104SHong Zhang 
131849b5e25fSSatish Balay   aa   = a->a;
13198a0c6efdSHong Zhang   ambs = a->mbs;
13208a0c6efdSHong Zhang 
13218a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13228a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13238a0c6efdSHong Zhang     aa   = a->a;
13248a0c6efdSHong Zhang     ambs = a->mbs;
13259566063dSJacob Faibussowitsch     PetscCall(VecGetArray(v,&x));
13268a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13279566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(v,&x));
13288a0c6efdSHong Zhang     PetscFunctionReturn(0);
13298a0c6efdSHong Zhang   }
13308a0c6efdSHong Zhang 
133149b5e25fSSatish Balay   ai   = a->i;
133249b5e25fSSatish Balay   aj   = a->j;
133349b5e25fSSatish Balay   bs2  = a->bs2;
13349566063dSJacob Faibussowitsch   PetscCall(VecSet(v,zero));
1335c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
13369566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v,&x));
133749b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
133849b5e25fSSatish Balay     j = ai[i];
133949b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
134049b5e25fSSatish Balay       row  = i*bs;
134149b5e25fSSatish Balay       aa_j = aa + j*bs2;
134249b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
134349b5e25fSSatish Balay     }
134449b5e25fSSatish Balay   }
13459566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v,&x));
134649b5e25fSSatish Balay   PetscFunctionReturn(0);
134749b5e25fSSatish Balay }
134849b5e25fSSatish Balay 
1349dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
135049b5e25fSSatish Balay {
135149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1352d9ca1df4SBarry Smith   PetscScalar       x;
1353d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
135449b5e25fSSatish Balay   MatScalar         *aa,*v;
1355fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1356fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1357ace3abfcSBarry Smith   PetscBool         flg;
135849b5e25fSSatish Balay 
135949b5e25fSSatish Balay   PetscFunctionBegin;
1360b3bf805bSHong Zhang   if (ll != rr) {
13619566063dSJacob Faibussowitsch     PetscCall(VecEqual(ll,rr,&flg));
136228b400f6SJacob Faibussowitsch     PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same");
1363b3bf805bSHong Zhang   }
1364b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
136549b5e25fSSatish Balay   ai  = a->i;
136649b5e25fSSatish Balay   aj  = a->j;
136749b5e25fSSatish Balay   aa  = a->a;
1368d0f46423SBarry Smith   m   = A->rmap->N;
1369d0f46423SBarry Smith   bs  = A->rmap->bs;
137049b5e25fSSatish Balay   mbs = a->mbs;
137149b5e25fSSatish Balay   bs2 = a->bs2;
137249b5e25fSSatish Balay 
13739566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll,&l));
13749566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll,&lm));
137508401ef6SPierre Jolivet   PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
137649b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
137749b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
137849b5e25fSSatish Balay     li = l + i*bs;
137949b5e25fSSatish Balay     v  = aa + bs2*ai[i];
138049b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
138149b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13825e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
138349b5e25fSSatish Balay         x = ri[k];
138449b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
138549b5e25fSSatish Balay       }
138649b5e25fSSatish Balay     }
138749b5e25fSSatish Balay   }
13889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll,&l));
13899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*a->nz));
139049b5e25fSSatish Balay   PetscFunctionReturn(0);
139149b5e25fSSatish Balay }
139249b5e25fSSatish Balay 
1393dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
139449b5e25fSSatish Balay {
139549b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
139649b5e25fSSatish Balay 
139749b5e25fSSatish Balay   PetscFunctionBegin;
139849b5e25fSSatish Balay   info->block_size   = a->bs2;
1399ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14006c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
14013966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
140249b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14038e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14047adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1405d5f3da31SBarry Smith   if (A->factortype) {
140649b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
140749b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
140849b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
140949b5e25fSSatish Balay   } else {
141049b5e25fSSatish Balay     info->fill_ratio_given  = 0;
141149b5e25fSSatish Balay     info->fill_ratio_needed = 0;
141249b5e25fSSatish Balay     info->factor_mallocs    = 0;
141349b5e25fSSatish Balay   }
141449b5e25fSSatish Balay   PetscFunctionReturn(0);
141549b5e25fSSatish Balay }
141649b5e25fSSatish Balay 
1417dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
141849b5e25fSSatish Balay {
141949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
142049b5e25fSSatish Balay 
142149b5e25fSSatish Balay   PetscFunctionBegin;
14229566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs]));
142349b5e25fSSatish Balay   PetscFunctionReturn(0);
142449b5e25fSSatish Balay }
1425dc354874SHong Zhang 
1426985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1427dc354874SHong Zhang {
1428dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1429d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1430d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1431c3fca9a7SHong Zhang   PetscReal       atmp;
1432d9ca1df4SBarry Smith   const MatScalar *aa;
1433985db425SBarry Smith   PetscScalar     *x;
143413f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1435dc354874SHong Zhang 
1436dc354874SHong Zhang   PetscFunctionBegin;
143728b400f6SJacob Faibussowitsch   PetscCheck(!idx,PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
143828b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1439d0f46423SBarry Smith   bs  = A->rmap->bs;
1440dc354874SHong Zhang   aa  = a->a;
1441dc354874SHong Zhang   ai  = a->i;
1442dc354874SHong Zhang   aj  = a->j;
144344117c81SHong Zhang   mbs = a->mbs;
1444dc354874SHong Zhang 
14459566063dSJacob Faibussowitsch   PetscCall(VecSet(v,0.0));
14469566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v,&x));
14479566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v,&n));
144808401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
144944117c81SHong Zhang   for (i=0; i<mbs; i++) {
1450d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1451d0f6400bSHong Zhang     brow  = bs*i;
145244117c81SHong Zhang     for (j=0; j<ncols; j++) {
1453d0f6400bSHong Zhang       bcol = bs*(*aj);
145444117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1455d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
145644117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1457d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1458d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1459c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1460c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
146144117c81SHong Zhang         }
146244117c81SHong Zhang       }
1463d0f6400bSHong Zhang       aj++;
1464dc354874SHong Zhang     }
1465dc354874SHong Zhang   }
14669566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v,&x));
1467dc354874SHong Zhang   PetscFunctionReturn(0);
1468dc354874SHong Zhang }
1469c2916339SPierre Jolivet 
14704222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1471c2916339SPierre Jolivet {
1472c2916339SPierre Jolivet   PetscFunctionBegin;
14739566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
14744222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1475c2916339SPierre Jolivet   PetscFunctionReturn(0);
1476c2916339SPierre Jolivet }
1477c2916339SPierre Jolivet 
1478c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1479c2916339SPierre Jolivet {
1480c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1481c2916339SPierre Jolivet   PetscScalar       *z = c;
1482c2916339SPierre Jolivet   const PetscScalar *xb;
1483c2916339SPierre Jolivet   PetscScalar       x1;
1484c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1485c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
14863c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
1487*b94d7dedSBarry Smith   const int         aconj = A->hermitian == PETSC_BOOL3_TRUE;
14883c2e41e1SStefano Zampini #else
14893c2e41e1SStefano Zampini   const int         aconj = 0;
14903c2e41e1SStefano Zampini #endif
1491c2916339SPierre Jolivet 
1492c2916339SPierre Jolivet   PetscFunctionBegin;
1493c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1494c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1495c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1496c2916339SPierre Jolivet     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1497c2916339SPierre Jolivet     jj = idx;
1498c2916339SPierre Jolivet     vv = v;
1499c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1500c2916339SPierre Jolivet       idx = jj;
1501c2916339SPierre Jolivet       v = vv;
1502c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1503c2916339SPierre Jolivet         xb = b + (*idx); x1 = xb[0+k*bm];
1504c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1;
15053c2e41e1SStefano Zampini         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1506c2916339SPierre Jolivet         v += 1;
1507c2916339SPierre Jolivet         ++idx;
1508c2916339SPierre Jolivet       }
1509c2916339SPierre Jolivet     }
1510c2916339SPierre Jolivet     z += 1;
1511c2916339SPierre Jolivet   }
1512c2916339SPierre Jolivet   PetscFunctionReturn(0);
1513c2916339SPierre Jolivet }
1514c2916339SPierre Jolivet 
1515c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1516c2916339SPierre Jolivet {
1517c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1518c2916339SPierre Jolivet   PetscScalar       *z = c;
1519c2916339SPierre Jolivet   const PetscScalar *xb;
1520c2916339SPierre Jolivet   PetscScalar       x1,x2;
1521c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1522c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1523c2916339SPierre Jolivet 
1524c2916339SPierre Jolivet   PetscFunctionBegin;
1525c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1526c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1527c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1528c2916339SPierre Jolivet     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1529c2916339SPierre Jolivet     jj = idx;
1530c2916339SPierre Jolivet     vv = v;
1531c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1532c2916339SPierre Jolivet       idx = jj;
1533c2916339SPierre Jolivet       v = vv;
1534c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1535c2916339SPierre Jolivet         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1536c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1537c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1538c2916339SPierre Jolivet         if (*idx != i) {
1539c2916339SPierre Jolivet           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1540c2916339SPierre Jolivet           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1541c2916339SPierre Jolivet         }
1542c2916339SPierre Jolivet         v += 4;
1543c2916339SPierre Jolivet         ++idx;
1544c2916339SPierre Jolivet       }
1545c2916339SPierre Jolivet     }
1546c2916339SPierre Jolivet     z += 2;
1547c2916339SPierre Jolivet   }
1548c2916339SPierre Jolivet   PetscFunctionReturn(0);
1549c2916339SPierre Jolivet }
1550c2916339SPierre Jolivet 
1551c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1552c2916339SPierre Jolivet {
1553c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1554c2916339SPierre Jolivet   PetscScalar       *z = c;
1555c2916339SPierre Jolivet   const PetscScalar *xb;
1556c2916339SPierre Jolivet   PetscScalar       x1,x2,x3;
1557c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1558c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1559c2916339SPierre Jolivet 
1560c2916339SPierre Jolivet   PetscFunctionBegin;
1561c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1562c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1563c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1564c2916339SPierre Jolivet     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1565c2916339SPierre Jolivet     jj = idx;
1566c2916339SPierre Jolivet     vv = v;
1567c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1568c2916339SPierre Jolivet       idx = jj;
1569c2916339SPierre Jolivet       v = vv;
1570c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1571c2916339SPierre Jolivet         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1572c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1573c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1574c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1575c2916339SPierre Jolivet         if (*idx != i) {
1576c2916339SPierre Jolivet           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1577c2916339SPierre Jolivet           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1578c2916339SPierre Jolivet           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1579c2916339SPierre Jolivet         }
1580c2916339SPierre Jolivet         v += 9;
1581c2916339SPierre Jolivet         ++idx;
1582c2916339SPierre Jolivet       }
1583c2916339SPierre Jolivet     }
1584c2916339SPierre Jolivet     z += 3;
1585c2916339SPierre Jolivet   }
1586c2916339SPierre Jolivet   PetscFunctionReturn(0);
1587c2916339SPierre Jolivet }
1588c2916339SPierre Jolivet 
1589c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1590c2916339SPierre Jolivet {
1591c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1592c2916339SPierre Jolivet   PetscScalar       *z = c;
1593c2916339SPierre Jolivet   const PetscScalar *xb;
1594c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4;
1595c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1596c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1597c2916339SPierre Jolivet 
1598c2916339SPierre Jolivet   PetscFunctionBegin;
1599c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1600c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1601c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1602c2916339SPierre Jolivet     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1603c2916339SPierre Jolivet     jj = idx;
1604c2916339SPierre Jolivet     vv = v;
1605c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1606c2916339SPierre Jolivet       idx = jj;
1607c2916339SPierre Jolivet       v = vv;
1608c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1609c2916339SPierre Jolivet         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1610c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1611c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1612c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1613c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1614c2916339SPierre Jolivet         if (*idx != i) {
1615c2916339SPierre Jolivet           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1616c2916339SPierre Jolivet           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1617c2916339SPierre Jolivet           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1618c2916339SPierre Jolivet           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1619c2916339SPierre Jolivet         }
1620c2916339SPierre Jolivet         v += 16;
1621c2916339SPierre Jolivet         ++idx;
1622c2916339SPierre Jolivet       }
1623c2916339SPierre Jolivet     }
1624c2916339SPierre Jolivet     z += 4;
1625c2916339SPierre Jolivet   }
1626c2916339SPierre Jolivet   PetscFunctionReturn(0);
1627c2916339SPierre Jolivet }
1628c2916339SPierre Jolivet 
1629c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1630c2916339SPierre Jolivet {
1631c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1632c2916339SPierre Jolivet   PetscScalar       *z = c;
1633c2916339SPierre Jolivet   const PetscScalar *xb;
1634c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4,x5;
1635c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1636c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1637c2916339SPierre Jolivet 
1638c2916339SPierre Jolivet   PetscFunctionBegin;
1639c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1640c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1641c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1642c2916339SPierre Jolivet     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1643c2916339SPierre Jolivet     jj = idx;
1644c2916339SPierre Jolivet     vv = v;
1645c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1646c2916339SPierre Jolivet       idx = jj;
1647c2916339SPierre Jolivet       v = vv;
1648c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1649c2916339SPierre Jolivet         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1650c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1651c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1652c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1653c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1654c2916339SPierre Jolivet         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1655c2916339SPierre Jolivet         if (*idx != i) {
1656c2916339SPierre Jolivet           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1657c2916339SPierre Jolivet           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1658c2916339SPierre Jolivet           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1659c2916339SPierre Jolivet           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1660c2916339SPierre Jolivet           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1661c2916339SPierre Jolivet         }
1662c2916339SPierre Jolivet         v += 25;
1663c2916339SPierre Jolivet         ++idx;
1664c2916339SPierre Jolivet       }
1665c2916339SPierre Jolivet     }
1666c2916339SPierre Jolivet     z += 5;
1667c2916339SPierre Jolivet   }
1668c2916339SPierre Jolivet   PetscFunctionReturn(0);
1669c2916339SPierre Jolivet }
1670c2916339SPierre Jolivet 
1671c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1672c2916339SPierre Jolivet {
1673c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1674c2916339SPierre Jolivet   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1675281439baSStefano Zampini   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1676c2916339SPierre Jolivet   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1677c2916339SPierre Jolivet   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1678c2916339SPierre Jolivet   PetscBLASInt      bbs,bcn,bbm,bcm;
1679f4259b30SLisandro Dalcin   PetscScalar       *z = NULL;
1680c2916339SPierre Jolivet   PetscScalar       *c,*b;
1681c2916339SPierre Jolivet   const MatScalar   *v;
1682c2916339SPierre Jolivet   const PetscInt    *idx,*ii;
1683c2916339SPierre Jolivet   PetscScalar       _DOne=1.0;
1684c2916339SPierre Jolivet 
1685c2916339SPierre Jolivet   PetscFunctionBegin;
1686c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
168708401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
168808401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
168908401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1690c2916339SPierre Jolivet   b = bd->v;
16919566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
16929566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C,&c));
1693c2916339SPierre Jolivet   switch (bs) {
1694c2916339SPierre Jolivet   case 1:
16959566063dSJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1696c2916339SPierre Jolivet     break;
1697c2916339SPierre Jolivet   case 2:
16989566063dSJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1699c2916339SPierre Jolivet     break;
1700c2916339SPierre Jolivet   case 3:
17019566063dSJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1702c2916339SPierre Jolivet     break;
1703c2916339SPierre Jolivet   case 4:
17049566063dSJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1705c2916339SPierre Jolivet     break;
1706c2916339SPierre Jolivet   case 5:
17079566063dSJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1708c2916339SPierre Jolivet     break;
1709c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
17109566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs,&bbs));
17119566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn,&bcn));
17129566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm,&bbm));
17139566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm,&bcm));
1714c2916339SPierre Jolivet     idx = a->j;
1715c2916339SPierre Jolivet     v   = a->a;
1716c2916339SPierre Jolivet     mbs = a->mbs;
1717c2916339SPierre Jolivet     ii  = a->i;
1718c2916339SPierre Jolivet     z   = c;
1719c2916339SPierre Jolivet     for (i=0; i<mbs; i++) {
1720c2916339SPierre Jolivet       n = ii[1] - ii[0]; ii++;
1721c2916339SPierre Jolivet       for (j=0; j<n; j++) {
17226718818eSStefano Zampini         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1723c2916339SPierre Jolivet         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1724c2916339SPierre Jolivet         v += bs2;
1725c2916339SPierre Jolivet       }
1726c2916339SPierre Jolivet       z += bs;
1727c2916339SPierre Jolivet     }
1728c2916339SPierre Jolivet   }
17299566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C,&c));
17309566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn));
1731c2916339SPierre Jolivet   PetscFunctionReturn(0);
1732c2916339SPierre Jolivet }
1733