xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 6718818e67c4802797f2feae43fc3d52878b6955)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1049b5e25fSSatish Balay {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
126849ba73SBarry Smith   PetscErrorCode ierr;
135d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
145d0c19d7SBarry Smith   const PetscInt *idx;
15db41cccfSHong Zhang   PetscBT        table_out,table_in;
16d94109b8SHong Zhang 
17d94109b8SHong Zhang   PetscFunctionBegin;
18e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19d94109b8SHong Zhang   mbs  = a->mbs;
20d94109b8SHong Zhang   ai   = a->i;
21d94109b8SHong Zhang   aj   = a->j;
22d0f46423SBarry Smith   bs   = A->rmap->bs;
2353b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2653b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27d94109b8SHong Zhang 
28d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
29d94109b8SHong Zhang     isz  = 0;
30db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31d94109b8SHong Zhang 
32d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
33d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35d94109b8SHong Zhang 
36db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37dbe03f88SHong Zhang     bcol_max = 0;
38d94109b8SHong Zhang     for (j=0; j<n; ++j) {
39d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
40e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
42dbe03f88SHong Zhang         nidx[isz++] = brow;
43dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
44dbe03f88SHong Zhang       }
45d94109b8SHong Zhang     }
46d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
476bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48d94109b8SHong Zhang 
49d94109b8SHong Zhang     k = 0;
50d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
51db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54d94109b8SHong Zhang 
55d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
56d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
57d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
58db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59d94109b8SHong Zhang           for (l = start; l<end; l++) {
60d94109b8SHong Zhang             bcol = aj[l];
61d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
62d7b97159SHong Zhang               nidx[isz++] = bcol;
63d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
64d7b97159SHong Zhang             }
65d94109b8SHong Zhang           }
66d94109b8SHong Zhang           k++;
67d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69d94109b8SHong Zhang           for (l = start; l<end; l++) {
70d94109b8SHong Zhang             bcol = aj[l];
71dbe03f88SHong Zhang             if (bcol > bcol_max) break;
72db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7326fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
75d94109b8SHong Zhang             }
76d94109b8SHong Zhang           }
77d94109b8SHong Zhang         }
78d94109b8SHong Zhang       }
79d94109b8SHong Zhang     } /* for each overlap */
80d94109b8SHong Zhang 
81d94109b8SHong Zhang     /* expand the Index Set */
82d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8326fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84d94109b8SHong Zhang     }
8570b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86d94109b8SHong Zhang   }
8794bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9094bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
915eee224dSHong Zhang   PetscFunctionReturn(0);
9249b5e25fSSatish Balay }
9349b5e25fSSatish Balay 
94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
96847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9749b5e25fSSatish Balay {
986849ba73SBarry Smith   PetscErrorCode ierr;
9949b5e25fSSatish Balay 
10049b5e25fSSatish Balay   PetscFunctionBegin;
101847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102847daeccSHong Zhang   Bseq->ops->mult                   = 0;
103847daeccSHong Zhang   Bseq->ops->multadd                = 0;
104847daeccSHong Zhang   Bseq->ops->multtranspose          = 0;
105847daeccSHong Zhang   Bseq->ops->multtransposeadd       = 0;
106847daeccSHong Zhang   Bseq->ops->lufactor               = 0;
107847daeccSHong Zhang   Bseq->ops->choleskyfactor         = 0;
108847daeccSHong Zhang   Bseq->ops->lufactorsymbolic       = 0;
109847daeccSHong Zhang   Bseq->ops->choleskyfactorsymbolic = 0;
110847daeccSHong Zhang   Bseq->ops->getinertia             = 0;
11149b5e25fSSatish Balay   PetscFunctionReturn(0);
11249b5e25fSSatish Balay }
11349b5e25fSSatish Balay 
1147dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1157dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116e50a8114SHong Zhang {
117e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118e50a8114SHong Zhang   PetscErrorCode ierr;
119e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121e50a8114SHong Zhang   const PetscInt *irow,*icol;
122e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
124e50a8114SHong Zhang   MatScalar      *mat_a;
125e50a8114SHong Zhang   Mat            C;
126e50a8114SHong Zhang   PetscBool      flag;
127e50a8114SHong Zhang 
128e50a8114SHong Zhang   PetscFunctionBegin;
129e50a8114SHong Zhang 
130e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134e50a8114SHong Zhang 
135e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136e50a8114SHong Zhang   ssmap = smap;
137e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139e50a8114SHong Zhang   /* determine lens of each row */
140e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
141e50a8114SHong Zhang     kstart  = ai[irow[i]];
142e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
143e50a8114SHong Zhang     lens[i] = 0;
144e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
145e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
146e50a8114SHong Zhang     }
147e50a8114SHong Zhang   }
148e50a8114SHong Zhang   /* Create and fill new matrix */
149e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
150e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
151e50a8114SHong Zhang 
152e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153580bdb30SBarry Smith     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155580bdb30SBarry Smith     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156e50a8114SHong Zhang     C    = *B;
157e50a8114SHong Zhang   } else {
158e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162e50a8114SHong Zhang   }
163e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
164e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
165e50a8114SHong Zhang     row      = irow[i];
166e50a8114SHong Zhang     kstart   = ai[row];
167e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
168e50a8114SHong Zhang     mat_i    = c->i[i];
169e50a8114SHong Zhang     mat_j    = c->j + mat_i;
170e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
171e50a8114SHong Zhang     mat_ilen = c->ilen + i;
172e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
173e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
174e50a8114SHong Zhang         *mat_j++ = tcol - 1;
175580bdb30SBarry Smith         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176e50a8114SHong Zhang         mat_a   += bs2;
177e50a8114SHong Zhang         (*mat_ilen)++;
178e50a8114SHong Zhang       }
179e50a8114SHong Zhang     }
180e50a8114SHong Zhang   }
181e50a8114SHong Zhang   /* sort */
182e50a8114SHong Zhang   {
183e50a8114SHong Zhang     MatScalar *work;
184e50a8114SHong Zhang 
185e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
187e50a8114SHong Zhang       PetscInt ilen;
188e50a8114SHong Zhang       mat_i = c->i[i];
189e50a8114SHong Zhang       mat_j = c->j + mat_i;
190e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
191e50a8114SHong Zhang       ilen  = c->ilen[i];
192e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193e50a8114SHong Zhang     }
194e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
195e50a8114SHong Zhang   }
196e50a8114SHong Zhang 
197e50a8114SHong Zhang   /* Free work space */
198e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
200e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
201e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203e50a8114SHong Zhang 
204e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205e50a8114SHong Zhang   *B   = C;
206e50a8114SHong Zhang   PetscFunctionReturn(0);
207e50a8114SHong Zhang }
208e50a8114SHong Zhang 
2097dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
21049b5e25fSSatish Balay {
211e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212e50a8114SHong Zhang   IS             is1,is2;
2136849ba73SBarry Smith   PetscErrorCode ierr;
214e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215e50a8114SHong Zhang   const PetscInt *irow,*icol;
21649b5e25fSSatish Balay 
21749b5e25fSSatish Balay   PetscFunctionBegin;
218e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222e50a8114SHong Zhang 
223e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
224e50a8114SHong Zhang    and form the IS with compressed IS */
225e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
226e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
230e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231e50a8114SHong Zhang   }
232e50a8114SHong Zhang   count = 0;
233e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
234e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
235e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
236e50a8114SHong Zhang   }
237e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238e50a8114SHong Zhang 
239580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
242e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243e50a8114SHong Zhang   }
244e50a8114SHong Zhang   count = 0;
245e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
246e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
247e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
248e50a8114SHong Zhang   }
249e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253e50a8114SHong Zhang 
2547dae84e0SHong Zhang   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257847daeccSHong Zhang 
2588f46ffcaSHong Zhang   if (isrow != iscol) {
2598f46ffcaSHong Zhang     PetscBool isequal;
2608f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261847daeccSHong Zhang     if (!isequal) {
262847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2638f46ffcaSHong Zhang     }
26449b5e25fSSatish Balay   }
26549b5e25fSSatish Balay   PetscFunctionReturn(0);
26649b5e25fSSatish Balay }
26749b5e25fSSatish Balay 
2687dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26949b5e25fSSatish Balay {
2706849ba73SBarry Smith   PetscErrorCode ierr;
27113f74950SBarry Smith   PetscInt       i;
27249b5e25fSSatish Balay 
27349b5e25fSSatish Balay   PetscFunctionBegin;
274e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
275df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276afebec48SHong Zhang   }
277e50a8114SHong Zhang 
278e50a8114SHong Zhang   for (i=0; i<n; i++) {
2797dae84e0SHong Zhang     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
28049b5e25fSSatish Balay   }
28149b5e25fSSatish Balay   PetscFunctionReturn(0);
28249b5e25fSSatish Balay }
28349b5e25fSSatish Balay 
28449b5e25fSSatish Balay /* -------------------------------------------------------*/
28549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28649b5e25fSSatish Balay /* -------------------------------------------------------*/
28749b5e25fSSatish Balay 
288dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28949b5e25fSSatish Balay {
29049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
292d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
293d9ca1df4SBarry Smith   const MatScalar   *v;
2946849ba73SBarry Smith   PetscErrorCode    ierr;
295d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29849b5e25fSSatish Balay 
29949b5e25fSSatish Balay   PetscFunctionBegin;
3002dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
302d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3031ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   v  = a->a;
30649b5e25fSSatish Balay   xb = x;
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
31149b5e25fSSatish Balay     ib          = aj + *ai;
312831a3094SHong Zhang     jmin        = 0;
31398c9bda7SSatish Balay     nonzerorow += (n>0);
3147fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31549b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
31649b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
317831a3094SHong Zhang       v        += 4; jmin++;
3187fbae186SHong Zhang     }
319444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32349b5e25fSSatish Balay       cval       = ib[j]*2;
32449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
32549b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
32649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32749b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32849b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32949b5e25fSSatish Balay       v        += 4;
33049b5e25fSSatish Balay     }
33149b5e25fSSatish Balay     xb +=2; ai++;
33249b5e25fSSatish Balay   }
33349b5e25fSSatish Balay 
334d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3351ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
33749b5e25fSSatish Balay   PetscFunctionReturn(0);
33849b5e25fSSatish Balay }
33949b5e25fSSatish Balay 
340dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
34149b5e25fSSatish Balay {
34249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
344d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
345d9ca1df4SBarry Smith   const MatScalar   *v;
3466849ba73SBarry Smith   PetscErrorCode    ierr;
347d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
35049b5e25fSSatish Balay 
35149b5e25fSSatish Balay   PetscFunctionBegin;
3522dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
354d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3551ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35649b5e25fSSatish Balay 
35749b5e25fSSatish Balay   v  = a->a;
35849b5e25fSSatish Balay   xb = x;
35949b5e25fSSatish Balay 
36049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
36349b5e25fSSatish Balay     ib          = aj + *ai;
364831a3094SHong Zhang     jmin        = 0;
36598c9bda7SSatish Balay     nonzerorow += (n>0);
3667fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
36749b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36849b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36949b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370831a3094SHong Zhang       v        += 9; jmin++;
3717fbae186SHong Zhang     }
372444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374831a3094SHong Zhang     for (j=jmin; j<n; j++) {
37549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37649b5e25fSSatish Balay       cval       = ib[j]*3;
37749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37849b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37949b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
38049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38149b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
38249b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
38349b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
38449b5e25fSSatish Balay       v        += 9;
38549b5e25fSSatish Balay     }
38649b5e25fSSatish Balay     xb +=3; ai++;
38749b5e25fSSatish Balay   }
38849b5e25fSSatish Balay 
389d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3901ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
39249b5e25fSSatish Balay   PetscFunctionReturn(0);
39349b5e25fSSatish Balay }
39449b5e25fSSatish Balay 
395dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
39649b5e25fSSatish Balay {
39749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
400d9ca1df4SBarry Smith   const MatScalar   *v;
4016849ba73SBarry Smith   PetscErrorCode    ierr;
402d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
40498c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
40549b5e25fSSatish Balay 
40649b5e25fSSatish Balay   PetscFunctionBegin;
4072dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
409d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4101ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
41149b5e25fSSatish Balay 
41249b5e25fSSatish Balay   v  = a->a;
41349b5e25fSSatish Balay   xb = x;
41449b5e25fSSatish Balay 
41549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
41649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
41749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
41849b5e25fSSatish Balay     ib          = aj + *ai;
419831a3094SHong Zhang     jmin        = 0;
42098c9bda7SSatish Balay     nonzerorow += (n>0);
4217fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
42249b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
42349b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
42449b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
42549b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426831a3094SHong Zhang       v        += 16; jmin++;
4277fbae186SHong Zhang     }
428444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430831a3094SHong Zhang     for (j=jmin; j<n; j++) {
43149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
43249b5e25fSSatish Balay       cval       = ib[j]*4;
43349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
43449b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
43549b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
43649b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
43749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43849b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43949b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
44049b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
44149b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
44249b5e25fSSatish Balay       v        += 16;
44349b5e25fSSatish Balay     }
44449b5e25fSSatish Balay     xb +=4; ai++;
44549b5e25fSSatish Balay   }
44649b5e25fSSatish Balay 
447d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4481ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
45049b5e25fSSatish Balay   PetscFunctionReturn(0);
45149b5e25fSSatish Balay }
45249b5e25fSSatish Balay 
453dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
45449b5e25fSSatish Balay {
45549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
458d9ca1df4SBarry Smith   const MatScalar   *v;
4596849ba73SBarry Smith   PetscErrorCode    ierr;
460d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
46298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
46349b5e25fSSatish Balay 
46449b5e25fSSatish Balay   PetscFunctionBegin;
4652dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
467d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4681ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
46949b5e25fSSatish Balay 
47049b5e25fSSatish Balay   v  = a->a;
47149b5e25fSSatish Balay   xb = x;
47249b5e25fSSatish Balay 
47349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
47449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
47549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
47649b5e25fSSatish Balay     ib          = aj + *ai;
477831a3094SHong Zhang     jmin        = 0;
47898c9bda7SSatish Balay     nonzerorow += (n>0);
4797fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
48049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
48149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
48249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
48349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
48449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485831a3094SHong Zhang       v        += 25; jmin++;
4867fbae186SHong Zhang     }
487444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489831a3094SHong Zhang     for (j=jmin; j<n; j++) {
49049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
49149b5e25fSSatish Balay       cval       = ib[j]*5;
49249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
49349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
49449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
49549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
49649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
49749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
49949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
50049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
50149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
50249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
50349b5e25fSSatish Balay       v        += 25;
50449b5e25fSSatish Balay     }
50549b5e25fSSatish Balay     xb +=5; ai++;
50649b5e25fSSatish Balay   }
50749b5e25fSSatish Balay 
508d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5091ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
51149b5e25fSSatish Balay   PetscFunctionReturn(0);
51249b5e25fSSatish Balay }
51349b5e25fSSatish Balay 
514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
51549b5e25fSSatish Balay {
51649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
519d9ca1df4SBarry Smith   const MatScalar   *v;
5206849ba73SBarry Smith   PetscErrorCode    ierr;
521d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
52398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
52449b5e25fSSatish Balay 
52549b5e25fSSatish Balay   PetscFunctionBegin;
5262dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
528d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5291ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
53049b5e25fSSatish Balay 
53149b5e25fSSatish Balay   v  = a->a;
53249b5e25fSSatish Balay   xb = x;
53349b5e25fSSatish Balay 
53449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
53549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
53649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
53749b5e25fSSatish Balay     ib          = aj + *ai;
538831a3094SHong Zhang     jmin        = 0;
53998c9bda7SSatish Balay     nonzerorow += (n>0);
5407fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
54149b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
54249b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
54349b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
54449b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
54549b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
54649b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547831a3094SHong Zhang       v        += 36; jmin++;
5487fbae186SHong Zhang     }
549444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551831a3094SHong Zhang     for (j=jmin; j<n; j++) {
55249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55349b5e25fSSatish Balay       cval       = ib[j]*6;
55449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
55549b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
55649b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
55749b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
55849b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
55949b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
56049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
56149b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
56249b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
56349b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
56449b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
56549b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
56649b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
56749b5e25fSSatish Balay       v        += 36;
56849b5e25fSSatish Balay     }
56949b5e25fSSatish Balay     xb +=6; ai++;
57049b5e25fSSatish Balay   }
57149b5e25fSSatish Balay 
572d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5731ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
57549b5e25fSSatish Balay   PetscFunctionReturn(0);
57649b5e25fSSatish Balay }
577c2916339SPierre Jolivet 
578dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
57949b5e25fSSatish Balay {
58049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
583d9ca1df4SBarry Smith   const MatScalar   *v;
5846849ba73SBarry Smith   PetscErrorCode    ierr;
585d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
58798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
58849b5e25fSSatish Balay 
58949b5e25fSSatish Balay   PetscFunctionBegin;
5902dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
592d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5931ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
59449b5e25fSSatish Balay 
59549b5e25fSSatish Balay   v  = a->a;
59649b5e25fSSatish Balay   xb = x;
59749b5e25fSSatish Balay 
59849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
59949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
60049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
60149b5e25fSSatish Balay     ib          = aj + *ai;
602831a3094SHong Zhang     jmin        = 0;
60398c9bda7SSatish Balay     nonzerorow += (n>0);
6047fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
60549b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
60649b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
60749b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
60849b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
60949b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
61049b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
61149b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612831a3094SHong Zhang       v        += 49; jmin++;
6137fbae186SHong Zhang     }
614444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616831a3094SHong Zhang     for (j=jmin; j<n; j++) {
61749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
61849b5e25fSSatish Balay       cval       = ib[j]*7;
61949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
62049b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
62149b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
62249b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
62349b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
62449b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
62549b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
62649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
62749b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
62849b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
62949b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
63049b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
63149b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
63249b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
63349b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
63449b5e25fSSatish Balay       v       += 49;
63549b5e25fSSatish Balay     }
63649b5e25fSSatish Balay     xb +=7; ai++;
63749b5e25fSSatish Balay   }
638d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6391ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
64149b5e25fSSatish Balay   PetscFunctionReturn(0);
64249b5e25fSSatish Balay }
64349b5e25fSSatish Balay 
64449b5e25fSSatish Balay /*
64549b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
64649b5e25fSSatish Balay */
647dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
64849b5e25fSSatish Balay {
64949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
652d9ca1df4SBarry Smith   const MatScalar   *v;
653dfbe8321SBarry Smith   PetscErrorCode    ierr;
654d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
65698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
65749b5e25fSSatish Balay 
65849b5e25fSSatish Balay   PetscFunctionBegin;
6592dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
66159689ffbSStefano Zampini   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
66259689ffbSStefano Zampini   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
66359689ffbSStefano Zampini 
66459689ffbSStefano Zampini   x_ptr = x;
66559689ffbSStefano Zampini   z_ptr = z;
66649b5e25fSSatish Balay 
66749b5e25fSSatish Balay   aj = a->j;
66849b5e25fSSatish Balay   v  = a->a;
66949b5e25fSSatish Balay   ii = a->i;
67049b5e25fSSatish Balay 
67149b5e25fSSatish Balay   if (!a->mult_work) {
672854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
67349b5e25fSSatish Balay   }
67449b5e25fSSatish Balay   work = a->mult_work;
67549b5e25fSSatish Balay 
67649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
67749b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
67849b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
67998c9bda7SSatish Balay     nonzerorow += (n>0);
68049b5e25fSSatish Balay 
68149b5e25fSSatish Balay     /* upper triangular part */
68249b5e25fSSatish Balay     for (j=0; j<n; j++) {
68349b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
68449b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
68549b5e25fSSatish Balay       workt += bs;
68649b5e25fSSatish Balay     }
68749b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
68896b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
68949b5e25fSSatish Balay 
69049b5e25fSSatish Balay     /* strict lower triangular part */
691831a3094SHong Zhang     idx = aj+ii[0];
69259689ffbSStefano Zampini     if (n && *idx == i) {
69396b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
694831a3094SHong Zhang     }
69596b9376eSHong Zhang 
69649b5e25fSSatish Balay     if (ncols > 0) {
69749b5e25fSSatish Balay       workt = work;
698580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
69996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
700831a3094SHong Zhang       for (j=0; j<n; j++) {
701831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
70249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
70349b5e25fSSatish Balay         workt += bs;
70449b5e25fSSatish Balay       }
70549b5e25fSSatish Balay     }
70649b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
70749b5e25fSSatish Balay   }
70849b5e25fSSatish Balay 
709d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
7101ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
71249b5e25fSSatish Balay   PetscFunctionReturn(0);
71349b5e25fSSatish Balay }
71449b5e25fSSatish Balay 
715dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
71649b5e25fSSatish Balay {
71749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718d9ca1df4SBarry Smith   PetscScalar       *z,x1;
719d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
720d9ca1df4SBarry Smith   const MatScalar   *v;
7216849ba73SBarry Smith   PetscErrorCode    ierr;
722d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
725eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
726eb1ec7c1SStefano Zampini   const int         aconj = A->hermitian;
727eb1ec7c1SStefano Zampini #else
728eb1ec7c1SStefano Zampini   const int         aconj = 0;
729eb1ec7c1SStefano Zampini #endif
73049b5e25fSSatish Balay 
73149b5e25fSSatish Balay   PetscFunctionBegin;
732343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
733c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
734d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7351ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
73649b5e25fSSatish Balay   v    = a->a;
73749b5e25fSSatish Balay   xb   = x;
73849b5e25fSSatish Balay 
73949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
74049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
74149b5e25fSSatish Balay     x1          = xb[0];
74249b5e25fSSatish Balay     ib          = aj + *ai;
743831a3094SHong Zhang     jmin        = 0;
74498c9bda7SSatish Balay     nonzerorow += (n>0);
7453d9ade75SStefano Zampini     if (n && *ib == i) {            /* (diag of A)*x */
746831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
747831a3094SHong Zhang     }
748eb1ec7c1SStefano Zampini     if (aconj) {
749eb1ec7c1SStefano Zampini       for (j=jmin; j<n; j++) {
750eb1ec7c1SStefano Zampini         cval    = *ib;
751eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
752eb1ec7c1SStefano Zampini         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
753eb1ec7c1SStefano Zampini       }
754eb1ec7c1SStefano Zampini     } else {
755831a3094SHong Zhang       for (j=jmin; j<n; j++) {
75649b5e25fSSatish Balay         cval    = *ib;
75749b5e25fSSatish Balay         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
75849b5e25fSSatish Balay         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
75949b5e25fSSatish Balay       }
760eb1ec7c1SStefano Zampini     }
76149b5e25fSSatish Balay     xb++; ai++;
76249b5e25fSSatish Balay   }
76349b5e25fSSatish Balay 
764d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
765bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76649b5e25fSSatish Balay 
767dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76849b5e25fSSatish Balay   PetscFunctionReturn(0);
76949b5e25fSSatish Balay }
77049b5e25fSSatish Balay 
771dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
77249b5e25fSSatish Balay {
77349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
774d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
775d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
776d9ca1df4SBarry Smith   const MatScalar   *v;
7776849ba73SBarry Smith   PetscErrorCode    ierr;
778d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
779d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
78098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
78149b5e25fSSatish Balay 
78249b5e25fSSatish Balay   PetscFunctionBegin;
783343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
784c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
785d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78749b5e25fSSatish Balay 
78849b5e25fSSatish Balay   v  = a->a;
78949b5e25fSSatish Balay   xb = x;
79049b5e25fSSatish Balay 
79149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
79249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
79349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
79449b5e25fSSatish Balay     ib          = aj + *ai;
795831a3094SHong Zhang     jmin        = 0;
79698c9bda7SSatish Balay     nonzerorow += (n>0);
79759689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
79849b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
79949b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
800831a3094SHong Zhang       v        += 4; jmin++;
8017fbae186SHong Zhang     }
802444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
803444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
804831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80649b5e25fSSatish Balay       cval       = ib[j]*2;
80749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
80849b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
80949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
81049b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
81149b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
81249b5e25fSSatish Balay       v        += 4;
81349b5e25fSSatish Balay     }
81449b5e25fSSatish Balay     xb +=2; ai++;
81549b5e25fSSatish Balay   }
816d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81849b5e25fSSatish Balay 
819dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
82049b5e25fSSatish Balay   PetscFunctionReturn(0);
82149b5e25fSSatish Balay }
82249b5e25fSSatish Balay 
823dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
82449b5e25fSSatish Balay {
82549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
826d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
827d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
828d9ca1df4SBarry Smith   const MatScalar   *v;
8296849ba73SBarry Smith   PetscErrorCode    ierr;
830d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
831d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83349b5e25fSSatish Balay 
83449b5e25fSSatish Balay   PetscFunctionBegin;
835343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
836c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
837d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8381ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
83949b5e25fSSatish Balay 
84049b5e25fSSatish Balay   v  = a->a;
84149b5e25fSSatish Balay   xb = x;
84249b5e25fSSatish Balay 
84349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
84649b5e25fSSatish Balay     ib          = aj + *ai;
847831a3094SHong Zhang     jmin        = 0;
84898c9bda7SSatish Balay     nonzerorow += (n>0);
84959689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
85049b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
85149b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
85249b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
853831a3094SHong Zhang       v        += 9; jmin++;
8547fbae186SHong Zhang     }
855444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
857831a3094SHong Zhang     for (j=jmin; j<n; j++) {
85849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
85949b5e25fSSatish Balay       cval       = ib[j]*3;
86049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
86149b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
86249b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
86349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86449b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
86549b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
86649b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
86749b5e25fSSatish Balay       v        += 9;
86849b5e25fSSatish Balay     }
86949b5e25fSSatish Balay     xb +=3; ai++;
87049b5e25fSSatish Balay   }
87149b5e25fSSatish Balay 
872d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87449b5e25fSSatish Balay 
875dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
87649b5e25fSSatish Balay   PetscFunctionReturn(0);
87749b5e25fSSatish Balay }
87849b5e25fSSatish Balay 
879dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
88049b5e25fSSatish Balay {
88149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
882d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
883d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
884d9ca1df4SBarry Smith   const MatScalar   *v;
8856849ba73SBarry Smith   PetscErrorCode    ierr;
886d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
887d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
88898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
88949b5e25fSSatish Balay 
89049b5e25fSSatish Balay   PetscFunctionBegin;
891343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
892c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
893d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8941ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
89549b5e25fSSatish Balay 
89649b5e25fSSatish Balay   v  = a->a;
89749b5e25fSSatish Balay   xb = x;
89849b5e25fSSatish Balay 
89949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
90249b5e25fSSatish Balay     ib          = aj + *ai;
903831a3094SHong Zhang     jmin        = 0;
90498c9bda7SSatish Balay     nonzerorow += (n>0);
90559689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
90649b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
90749b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
90849b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
90949b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
910831a3094SHong Zhang       v        += 16; jmin++;
9117fbae186SHong Zhang     }
912444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
913444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
914831a3094SHong Zhang     for (j=jmin; j<n; j++) {
91549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
91649b5e25fSSatish Balay       cval       = ib[j]*4;
91749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
91849b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
91949b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
92049b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
92149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92249b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
92349b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
92449b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
92549b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
92649b5e25fSSatish Balay       v        += 16;
92749b5e25fSSatish Balay     }
92849b5e25fSSatish Balay     xb +=4; ai++;
92949b5e25fSSatish Balay   }
93049b5e25fSSatish Balay 
931d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
932bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
93349b5e25fSSatish Balay 
934dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
93549b5e25fSSatish Balay   PetscFunctionReturn(0);
93649b5e25fSSatish Balay }
93749b5e25fSSatish Balay 
938dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
93949b5e25fSSatish Balay {
94049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
941d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
942d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
943d9ca1df4SBarry Smith   const MatScalar   *v;
9446849ba73SBarry Smith   PetscErrorCode    ierr;
945d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
946d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
94798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
94849b5e25fSSatish Balay 
94949b5e25fSSatish Balay   PetscFunctionBegin;
950343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
951c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
952d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9531ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
95449b5e25fSSatish Balay 
95549b5e25fSSatish Balay   v  = a->a;
95649b5e25fSSatish Balay   xb = x;
95749b5e25fSSatish Balay 
95849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
95949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
96149b5e25fSSatish Balay     ib          = aj + *ai;
962831a3094SHong Zhang     jmin        = 0;
96398c9bda7SSatish Balay     nonzerorow += (n>0);
96459689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
96549b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
96649b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
96749b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
96849b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
96949b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
970831a3094SHong Zhang       v        += 25; jmin++;
9717fbae186SHong Zhang     }
972444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
973444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
974831a3094SHong Zhang     for (j=jmin; j<n; j++) {
97549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
97649b5e25fSSatish Balay       cval       = ib[j]*5;
97749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
97849b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
97949b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
98049b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
98149b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
98249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
98349b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
98449b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
98549b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
98649b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
98749b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
98849b5e25fSSatish Balay       v        += 25;
98949b5e25fSSatish Balay     }
99049b5e25fSSatish Balay     xb +=5; ai++;
99149b5e25fSSatish Balay   }
99249b5e25fSSatish Balay 
993d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
994bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
99549b5e25fSSatish Balay 
996dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
99749b5e25fSSatish Balay   PetscFunctionReturn(0);
99849b5e25fSSatish Balay }
999c2916339SPierre Jolivet 
1000dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
100149b5e25fSSatish Balay {
100249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1003d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1004d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1005d9ca1df4SBarry Smith   const MatScalar   *v;
10066849ba73SBarry Smith   PetscErrorCode    ierr;
1007d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1008d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
100998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
101049b5e25fSSatish Balay 
101149b5e25fSSatish Balay   PetscFunctionBegin;
1012343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1013c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1014d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10151ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
101649b5e25fSSatish Balay 
101749b5e25fSSatish Balay   v  = a->a;
101849b5e25fSSatish Balay   xb = x;
101949b5e25fSSatish Balay 
102049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
102149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
102249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
102349b5e25fSSatish Balay     ib          = aj + *ai;
1024831a3094SHong Zhang     jmin        = 0;
102598c9bda7SSatish Balay     nonzerorow += (n>0);
102659689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
102749b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
102849b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
102949b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
103049b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
103149b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
103249b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1033831a3094SHong Zhang       v        += 36; jmin++;
10347fbae186SHong Zhang     }
1035444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1036444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1037831a3094SHong Zhang     for (j=jmin; j<n; j++) {
103849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
103949b5e25fSSatish Balay       cval       = ib[j]*6;
104049b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
104149b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
104249b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
104349b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
104449b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
104549b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
104649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
104749b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
104849b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
104949b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
105049b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
105149b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
105249b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
105349b5e25fSSatish Balay       v        += 36;
105449b5e25fSSatish Balay     }
105549b5e25fSSatish Balay     xb +=6; ai++;
105649b5e25fSSatish Balay   }
105749b5e25fSSatish Balay 
1058d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1059bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
106049b5e25fSSatish Balay 
1061dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
106249b5e25fSSatish Balay   PetscFunctionReturn(0);
106349b5e25fSSatish Balay }
106449b5e25fSSatish Balay 
1065dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
106649b5e25fSSatish Balay {
106749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1068d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1069d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1070d9ca1df4SBarry Smith   const MatScalar   *v;
10716849ba73SBarry Smith   PetscErrorCode    ierr;
1072d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1073d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
107498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
107549b5e25fSSatish Balay 
107649b5e25fSSatish Balay   PetscFunctionBegin;
1077343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1078c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1079d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10801ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
108149b5e25fSSatish Balay 
108249b5e25fSSatish Balay   v  = a->a;
108349b5e25fSSatish Balay   xb = x;
108449b5e25fSSatish Balay 
108549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
108649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
108749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
108849b5e25fSSatish Balay     ib          = aj + *ai;
1089831a3094SHong Zhang     jmin        = 0;
109098c9bda7SSatish Balay     nonzerorow += (n>0);
109159689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
109249b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
109349b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
109449b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
109549b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
109649b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
109749b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
109849b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1099831a3094SHong Zhang       v        += 49; jmin++;
11007fbae186SHong Zhang     }
1101444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103831a3094SHong Zhang     for (j=jmin; j<n; j++) {
110449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
110549b5e25fSSatish Balay       cval       = ib[j]*7;
110649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
110749b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
110849b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
110949b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
111049b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
111149b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
111249b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
111349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
111449b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
111549b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
111649b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
111749b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
111849b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
111949b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
112049b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
112149b5e25fSSatish Balay       v       += 49;
112249b5e25fSSatish Balay     }
112349b5e25fSSatish Balay     xb +=7; ai++;
112449b5e25fSSatish Balay   }
112549b5e25fSSatish Balay 
1126d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1127bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
112849b5e25fSSatish Balay 
1129dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
113049b5e25fSSatish Balay   PetscFunctionReturn(0);
113149b5e25fSSatish Balay }
113249b5e25fSSatish Balay 
1133dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
113449b5e25fSSatish Balay {
113549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1136d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1137d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1138d9ca1df4SBarry Smith   const MatScalar   *v;
1139dfbe8321SBarry Smith   PetscErrorCode    ierr;
1140d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1141d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
114298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
114349b5e25fSSatish Balay 
114449b5e25fSSatish Balay   PetscFunctionBegin;
1145343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1146c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1147d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11481ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
114949b5e25fSSatish Balay 
115049b5e25fSSatish Balay   aj = a->j;
115149b5e25fSSatish Balay   v  = a->a;
115249b5e25fSSatish Balay   ii = a->i;
115349b5e25fSSatish Balay 
115449b5e25fSSatish Balay   if (!a->mult_work) {
1155854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
115649b5e25fSSatish Balay   }
115749b5e25fSSatish Balay   work = a->mult_work;
115849b5e25fSSatish Balay 
115949b5e25fSSatish Balay 
116049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
116149b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
116249b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
116398c9bda7SSatish Balay     nonzerorow += (n>0);
116449b5e25fSSatish Balay 
116549b5e25fSSatish Balay     /* upper triangular part */
116649b5e25fSSatish Balay     for (j=0; j<n; j++) {
116749b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
116849b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
116949b5e25fSSatish Balay       workt += bs;
117049b5e25fSSatish Balay     }
117149b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
117296b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
117349b5e25fSSatish Balay 
117449b5e25fSSatish Balay     /* strict lower triangular part */
1175831a3094SHong Zhang     idx = aj+ii[0];
117659689ffbSStefano Zampini     if (n && *idx == i) {
117796b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1178831a3094SHong Zhang     }
117949b5e25fSSatish Balay     if (ncols > 0) {
118049b5e25fSSatish Balay       workt = work;
1181580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
118296b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1183831a3094SHong Zhang       for (j=0; j<n; j++) {
1184831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
118549b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
118649b5e25fSSatish Balay         workt += bs;
118749b5e25fSSatish Balay       }
118849b5e25fSSatish Balay     }
118949b5e25fSSatish Balay 
119049b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
119149b5e25fSSatish Balay   }
119249b5e25fSSatish Balay 
1193d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1194bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
119549b5e25fSSatish Balay 
1196dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
119749b5e25fSSatish Balay   PetscFunctionReturn(0);
119849b5e25fSSatish Balay }
119949b5e25fSSatish Balay 
1200f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
120149b5e25fSSatish Balay {
120249b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1203f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1204efee365bSSatish Balay   PetscErrorCode ierr;
1205c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
120649b5e25fSSatish Balay 
120749b5e25fSSatish Balay   PetscFunctionBegin;
1208c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
12098b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1210efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
121149b5e25fSSatish Balay   PetscFunctionReturn(0);
121249b5e25fSSatish Balay }
121349b5e25fSSatish Balay 
1214dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
121549b5e25fSSatish Balay {
121649b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1217d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
121849b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1219d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
12206849ba73SBarry Smith   PetscErrorCode  ierr;
1221d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
122249b5e25fSSatish Balay 
122349b5e25fSSatish Balay   PetscFunctionBegin;
1224c40ae873SPierre Jolivet   if (!a->nz) {
1225c40ae873SPierre Jolivet     *norm = 0.0;
1226c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1227c40ae873SPierre Jolivet   }
122849b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
122949b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
123059689ffbSStefano Zampini       jmin = a->i[k];
123159689ffbSStefano Zampini       jmax = a->i[k+1];
1232831a3094SHong Zhang       col  = aj + jmin;
123359689ffbSStefano Zampini       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
123449b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
123549b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
123649b5e25fSSatish Balay         }
1237831a3094SHong Zhang         jmin++;
1238831a3094SHong Zhang       }
1239831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
124049b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
124149b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
124249b5e25fSSatish Balay         }
124349b5e25fSSatish Balay       }
124449b5e25fSSatish Balay     }
12458f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
124651f70360SJed Brown     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
12470b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1248dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12490b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12500b8dc8d2SHong Zhang     il[0] = 0;
125149b5e25fSSatish Balay 
125249b5e25fSSatish Balay     *norm = 0.0;
125349b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
125449b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
125549b5e25fSSatish Balay       /*-- col sum --*/
125649b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1257831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1258831a3094SHong Zhang                   at step k */
125949b5e25fSSatish Balay       while (i<mbs) {
126049b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
126149b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
126249b5e25fSSatish Balay         for (j=0; j<bs; j++) {
126349b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
126449b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
126549b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
126649b5e25fSSatish Balay           }
126749b5e25fSSatish Balay         }
126849b5e25fSSatish Balay         /* update il, jl */
1269831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1270831a3094SHong Zhang         jmax = a->i[i+1];
127149b5e25fSSatish Balay         if (jmin < jmax) {
127249b5e25fSSatish Balay           il[i] = jmin;
127349b5e25fSSatish Balay           j     = a->j[jmin];
127449b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
127549b5e25fSSatish Balay         }
127649b5e25fSSatish Balay         i = nexti;
127749b5e25fSSatish Balay       }
127849b5e25fSSatish Balay       /*-- row sum --*/
127959689ffbSStefano Zampini       jmin = a->i[k];
128059689ffbSStefano Zampini       jmax = a->i[k+1];
128149b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
128249b5e25fSSatish Balay         for (j=0; j<bs; j++) {
128349b5e25fSSatish Balay           v = a->a + i*bs2 + j;
128449b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12850b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
128649b5e25fSSatish Balay           }
128749b5e25fSSatish Balay         }
128849b5e25fSSatish Balay       }
128949b5e25fSSatish Balay       /* add k_th block row to il, jl */
1290831a3094SHong Zhang       col = aj+jmin;
129159689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
129249b5e25fSSatish Balay       if (jmin < jmax) {
129349b5e25fSSatish Balay         il[k] = jmin;
12940b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
129549b5e25fSSatish Balay       }
129649b5e25fSSatish Balay       for (j=0; j<bs; j++) {
129749b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
129849b5e25fSSatish Balay       }
129949b5e25fSSatish Balay     }
130074ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
130151f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1302f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
130349b5e25fSSatish Balay   PetscFunctionReturn(0);
130449b5e25fSSatish Balay }
130549b5e25fSSatish Balay 
1306ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
130749b5e25fSSatish Balay {
130849b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1309dfbe8321SBarry Smith   PetscErrorCode ierr;
131049b5e25fSSatish Balay 
131149b5e25fSSatish Balay   PetscFunctionBegin;
131249b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1313d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1314ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1315ef511fbeSHong Zhang     PetscFunctionReturn(0);
131649b5e25fSSatish Balay   }
131749b5e25fSSatish Balay 
131849b5e25fSSatish Balay   /* if the a->i are the same */
1319580bdb30SBarry Smith   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
132026fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
132149b5e25fSSatish Balay 
132249b5e25fSSatish Balay   /* if a->j are the same */
1323580bdb30SBarry Smith   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
132426fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
132526fbe8dcSKarl Rupp 
132649b5e25fSSatish Balay   /* if a->a are the same */
1327580bdb30SBarry Smith   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1328935af2e7SHong Zhang   PetscFunctionReturn(0);
132949b5e25fSSatish Balay }
133049b5e25fSSatish Balay 
1331dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
133249b5e25fSSatish Balay {
133349b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1334dfbe8321SBarry Smith   PetscErrorCode  ierr;
1335d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1336d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
133787828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1338d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
133949b5e25fSSatish Balay 
134049b5e25fSSatish Balay   PetscFunctionBegin;
1341d0f46423SBarry Smith   bs = A->rmap->bs;
1342e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
134382799104SHong Zhang 
134449b5e25fSSatish Balay   aa   = a->a;
13458a0c6efdSHong Zhang   ambs = a->mbs;
13468a0c6efdSHong Zhang 
13478a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13488a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13498a0c6efdSHong Zhang     aa   = a->a;
13508a0c6efdSHong Zhang     ambs = a->mbs;
13518a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13528a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13538a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13548a0c6efdSHong Zhang     PetscFunctionReturn(0);
13558a0c6efdSHong Zhang   }
13568a0c6efdSHong Zhang 
135749b5e25fSSatish Balay   ai   = a->i;
135849b5e25fSSatish Balay   aj   = a->j;
135949b5e25fSSatish Balay   bs2  = a->bs2;
13602dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
1361c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
13621ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
136349b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
136449b5e25fSSatish Balay     j = ai[i];
136549b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
136649b5e25fSSatish Balay       row  = i*bs;
136749b5e25fSSatish Balay       aa_j = aa + j*bs2;
136849b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
136949b5e25fSSatish Balay     }
137049b5e25fSSatish Balay   }
13711ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
137249b5e25fSSatish Balay   PetscFunctionReturn(0);
137349b5e25fSSatish Balay }
137449b5e25fSSatish Balay 
1375dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
137649b5e25fSSatish Balay {
137749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1378d9ca1df4SBarry Smith   PetscScalar       x;
1379d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
138049b5e25fSSatish Balay   MatScalar         *aa,*v;
1381dfbe8321SBarry Smith   PetscErrorCode    ierr;
1382fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1383fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1384ace3abfcSBarry Smith   PetscBool         flg;
138549b5e25fSSatish Balay 
138649b5e25fSSatish Balay   PetscFunctionBegin;
1387b3bf805bSHong Zhang   if (ll != rr) {
1388b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1389e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1390b3bf805bSHong Zhang   }
1391b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
139249b5e25fSSatish Balay   ai  = a->i;
139349b5e25fSSatish Balay   aj  = a->j;
139449b5e25fSSatish Balay   aa  = a->a;
1395d0f46423SBarry Smith   m   = A->rmap->N;
1396d0f46423SBarry Smith   bs  = A->rmap->bs;
139749b5e25fSSatish Balay   mbs = a->mbs;
139849b5e25fSSatish Balay   bs2 = a->bs2;
139949b5e25fSSatish Balay 
1400d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
140149b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1402e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
140349b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
140449b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
140549b5e25fSSatish Balay     li = l + i*bs;
140649b5e25fSSatish Balay     v  = aa + bs2*ai[i];
140749b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
140849b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
14095e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
141049b5e25fSSatish Balay         x = ri[k];
141149b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
141249b5e25fSSatish Balay       }
141349b5e25fSSatish Balay     }
141449b5e25fSSatish Balay   }
1415d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1416dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
141749b5e25fSSatish Balay   PetscFunctionReturn(0);
141849b5e25fSSatish Balay }
141949b5e25fSSatish Balay 
1420dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
142149b5e25fSSatish Balay {
142249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
142349b5e25fSSatish Balay 
142449b5e25fSSatish Balay   PetscFunctionBegin;
142549b5e25fSSatish Balay   info->block_size   = a->bs2;
1426ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14276c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
14283966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
142949b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14308e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14317adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1432d5f3da31SBarry Smith   if (A->factortype) {
143349b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
143449b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
143549b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
143649b5e25fSSatish Balay   } else {
143749b5e25fSSatish Balay     info->fill_ratio_given  = 0;
143849b5e25fSSatish Balay     info->fill_ratio_needed = 0;
143949b5e25fSSatish Balay     info->factor_mallocs    = 0;
144049b5e25fSSatish Balay   }
144149b5e25fSSatish Balay   PetscFunctionReturn(0);
144249b5e25fSSatish Balay }
144349b5e25fSSatish Balay 
1444dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
144549b5e25fSSatish Balay {
144649b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1447dfbe8321SBarry Smith   PetscErrorCode ierr;
144849b5e25fSSatish Balay 
144949b5e25fSSatish Balay   PetscFunctionBegin;
1450580bdb30SBarry Smith   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
145149b5e25fSSatish Balay   PetscFunctionReturn(0);
145249b5e25fSSatish Balay }
1453dc354874SHong Zhang 
1454985db425SBarry Smith /*
1455985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1456985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1457985db425SBarry Smith */
1458985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1459dc354874SHong Zhang {
1460dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1461dfbe8321SBarry Smith   PetscErrorCode  ierr;
1462d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1463d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1464c3fca9a7SHong Zhang   PetscReal       atmp;
1465d9ca1df4SBarry Smith   const MatScalar *aa;
1466985db425SBarry Smith   PetscScalar     *x;
146713f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1468dc354874SHong Zhang 
1469dc354874SHong Zhang   PetscFunctionBegin;
1470e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1471e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1472d0f46423SBarry Smith   bs  = A->rmap->bs;
1473dc354874SHong Zhang   aa  = a->a;
1474dc354874SHong Zhang   ai  = a->i;
1475dc354874SHong Zhang   aj  = a->j;
147644117c81SHong Zhang   mbs = a->mbs;
1477dc354874SHong Zhang 
1478985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14791ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1480dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1481e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
148244117c81SHong Zhang   for (i=0; i<mbs; i++) {
1483d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1484d0f6400bSHong Zhang     brow  = bs*i;
148544117c81SHong Zhang     for (j=0; j<ncols; j++) {
1486d0f6400bSHong Zhang       bcol = bs*(*aj);
148744117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1488d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
148944117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1490d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1491d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1492c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1493c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
149444117c81SHong Zhang         }
149544117c81SHong Zhang       }
1496d0f6400bSHong Zhang       aj++;
1497dc354874SHong Zhang     }
1498dc354874SHong Zhang   }
14991ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1500dc354874SHong Zhang   PetscFunctionReturn(0);
1501dc354874SHong Zhang }
1502c2916339SPierre Jolivet 
15034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1504c2916339SPierre Jolivet {
1505c2916339SPierre Jolivet   PetscErrorCode ierr;
1506c2916339SPierre Jolivet 
1507c2916339SPierre Jolivet   PetscFunctionBegin;
1508c2916339SPierre Jolivet   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
15094222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1510c2916339SPierre Jolivet   PetscFunctionReturn(0);
1511c2916339SPierre Jolivet }
1512c2916339SPierre Jolivet 
1513c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1514c2916339SPierre Jolivet {
1515c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1516c2916339SPierre Jolivet   PetscScalar       *z = c;
1517c2916339SPierre Jolivet   const PetscScalar *xb;
1518c2916339SPierre Jolivet   PetscScalar       x1;
1519c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1520c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
15213c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
15223c2e41e1SStefano Zampini   const int         aconj = A->hermitian;
15233c2e41e1SStefano Zampini #else
15243c2e41e1SStefano Zampini   const int         aconj = 0;
15253c2e41e1SStefano Zampini #endif
1526c2916339SPierre Jolivet 
1527c2916339SPierre Jolivet   PetscFunctionBegin;
1528c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1529c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1530c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1531c2916339SPierre Jolivet     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1532c2916339SPierre Jolivet     jj = idx;
1533c2916339SPierre Jolivet     vv = v;
1534c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1535c2916339SPierre Jolivet       idx = jj;
1536c2916339SPierre Jolivet       v = vv;
1537c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1538c2916339SPierre Jolivet         xb = b + (*idx); x1 = xb[0+k*bm];
1539c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1;
15403c2e41e1SStefano Zampini         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1541c2916339SPierre Jolivet         v += 1;
1542c2916339SPierre Jolivet         ++idx;
1543c2916339SPierre Jolivet       }
1544c2916339SPierre Jolivet     }
1545c2916339SPierre Jolivet     z += 1;
1546c2916339SPierre Jolivet   }
1547c2916339SPierre Jolivet   PetscFunctionReturn(0);
1548c2916339SPierre Jolivet }
1549c2916339SPierre Jolivet 
1550c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1551c2916339SPierre Jolivet {
1552c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1553c2916339SPierre Jolivet   PetscScalar       *z = c;
1554c2916339SPierre Jolivet   const PetscScalar *xb;
1555c2916339SPierre Jolivet   PetscScalar       x1,x2;
1556c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1557c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1558c2916339SPierre Jolivet 
1559c2916339SPierre Jolivet   PetscFunctionBegin;
1560c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1561c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1562c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1563c2916339SPierre Jolivet     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1564c2916339SPierre Jolivet     jj = idx;
1565c2916339SPierre Jolivet     vv = v;
1566c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1567c2916339SPierre Jolivet       idx = jj;
1568c2916339SPierre Jolivet       v = vv;
1569c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1570c2916339SPierre Jolivet         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1571c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1572c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1573c2916339SPierre Jolivet         if (*idx != i) {
1574c2916339SPierre Jolivet           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1575c2916339SPierre Jolivet           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1576c2916339SPierre Jolivet         }
1577c2916339SPierre Jolivet         v += 4;
1578c2916339SPierre Jolivet         ++idx;
1579c2916339SPierre Jolivet       }
1580c2916339SPierre Jolivet     }
1581c2916339SPierre Jolivet     z += 2;
1582c2916339SPierre Jolivet   }
1583c2916339SPierre Jolivet   PetscFunctionReturn(0);
1584c2916339SPierre Jolivet }
1585c2916339SPierre Jolivet 
1586c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1587c2916339SPierre Jolivet {
1588c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1589c2916339SPierre Jolivet   PetscScalar       *z = c;
1590c2916339SPierre Jolivet   const PetscScalar *xb;
1591c2916339SPierre Jolivet   PetscScalar       x1,x2,x3;
1592c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1593c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1594c2916339SPierre Jolivet 
1595c2916339SPierre Jolivet   PetscFunctionBegin;
1596c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1597c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1598c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1599c2916339SPierre Jolivet     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600c2916339SPierre Jolivet     jj = idx;
1601c2916339SPierre Jolivet     vv = v;
1602c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1603c2916339SPierre Jolivet       idx = jj;
1604c2916339SPierre Jolivet       v = vv;
1605c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1606c2916339SPierre Jolivet         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1607c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1608c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1609c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1610c2916339SPierre Jolivet         if (*idx != i) {
1611c2916339SPierre Jolivet           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1612c2916339SPierre Jolivet           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1613c2916339SPierre Jolivet           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1614c2916339SPierre Jolivet         }
1615c2916339SPierre Jolivet         v += 9;
1616c2916339SPierre Jolivet         ++idx;
1617c2916339SPierre Jolivet       }
1618c2916339SPierre Jolivet     }
1619c2916339SPierre Jolivet     z += 3;
1620c2916339SPierre Jolivet   }
1621c2916339SPierre Jolivet   PetscFunctionReturn(0);
1622c2916339SPierre Jolivet }
1623c2916339SPierre Jolivet 
1624c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1625c2916339SPierre Jolivet {
1626c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1627c2916339SPierre Jolivet   PetscScalar       *z = c;
1628c2916339SPierre Jolivet   const PetscScalar *xb;
1629c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4;
1630c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1631c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1632c2916339SPierre Jolivet 
1633c2916339SPierre Jolivet   PetscFunctionBegin;
1634c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1635c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1636c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1637c2916339SPierre Jolivet     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1638c2916339SPierre Jolivet     jj = idx;
1639c2916339SPierre Jolivet     vv = v;
1640c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1641c2916339SPierre Jolivet       idx = jj;
1642c2916339SPierre Jolivet       v = vv;
1643c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1644c2916339SPierre Jolivet         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1645c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1646c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1647c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1648c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1649c2916339SPierre Jolivet         if (*idx != i) {
1650c2916339SPierre Jolivet           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1651c2916339SPierre Jolivet           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1652c2916339SPierre Jolivet           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1653c2916339SPierre Jolivet           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1654c2916339SPierre Jolivet         }
1655c2916339SPierre Jolivet         v += 16;
1656c2916339SPierre Jolivet         ++idx;
1657c2916339SPierre Jolivet       }
1658c2916339SPierre Jolivet     }
1659c2916339SPierre Jolivet     z += 4;
1660c2916339SPierre Jolivet   }
1661c2916339SPierre Jolivet   PetscFunctionReturn(0);
1662c2916339SPierre Jolivet }
1663c2916339SPierre Jolivet 
1664c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1665c2916339SPierre Jolivet {
1666c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1667c2916339SPierre Jolivet   PetscScalar       *z = c;
1668c2916339SPierre Jolivet   const PetscScalar *xb;
1669c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4,x5;
1670c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1671c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1672c2916339SPierre Jolivet 
1673c2916339SPierre Jolivet   PetscFunctionBegin;
1674c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1675c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1676c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1677c2916339SPierre Jolivet     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1678c2916339SPierre Jolivet     jj = idx;
1679c2916339SPierre Jolivet     vv = v;
1680c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1681c2916339SPierre Jolivet       idx = jj;
1682c2916339SPierre Jolivet       v = vv;
1683c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1684c2916339SPierre Jolivet         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1685c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1686c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1687c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1688c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1689c2916339SPierre Jolivet         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1690c2916339SPierre Jolivet         if (*idx != i) {
1691c2916339SPierre Jolivet           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1692c2916339SPierre Jolivet           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1693c2916339SPierre Jolivet           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1694c2916339SPierre Jolivet           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1695c2916339SPierre Jolivet           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1696c2916339SPierre Jolivet         }
1697c2916339SPierre Jolivet         v += 25;
1698c2916339SPierre Jolivet         ++idx;
1699c2916339SPierre Jolivet       }
1700c2916339SPierre Jolivet     }
1701c2916339SPierre Jolivet     z += 5;
1702c2916339SPierre Jolivet   }
1703c2916339SPierre Jolivet   PetscFunctionReturn(0);
1704c2916339SPierre Jolivet }
1705c2916339SPierre Jolivet 
1706c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1707c2916339SPierre Jolivet {
1708c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1709c2916339SPierre Jolivet   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1710281439baSStefano Zampini   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1711c2916339SPierre Jolivet   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1712c2916339SPierre Jolivet   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1713c2916339SPierre Jolivet   PetscBLASInt      bbs,bcn,bbm,bcm;
1714c2916339SPierre Jolivet   PetscScalar       *z = 0;
1715c2916339SPierre Jolivet   PetscScalar       *c,*b;
1716c2916339SPierre Jolivet   const MatScalar   *v;
1717c2916339SPierre Jolivet   const PetscInt    *idx,*ii;
1718c2916339SPierre Jolivet   PetscScalar       _DOne=1.0;
1719c2916339SPierre Jolivet   PetscErrorCode    ierr;
1720c2916339SPierre Jolivet 
1721c2916339SPierre Jolivet   PetscFunctionBegin;
1722c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
1723c2916339SPierre Jolivet   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1724c2916339SPierre Jolivet   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1725c2916339SPierre Jolivet   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1726c2916339SPierre Jolivet   b = bd->v;
1727c2916339SPierre Jolivet   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1728c2916339SPierre Jolivet   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1729c2916339SPierre Jolivet   switch (bs) {
1730c2916339SPierre Jolivet   case 1:
1731c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1732c2916339SPierre Jolivet     break;
1733c2916339SPierre Jolivet   case 2:
1734c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1735c2916339SPierre Jolivet     break;
1736c2916339SPierre Jolivet   case 3:
1737c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1738c2916339SPierre Jolivet     break;
1739c2916339SPierre Jolivet   case 4:
1740c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1741c2916339SPierre Jolivet     break;
1742c2916339SPierre Jolivet   case 5:
1743c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1744c2916339SPierre Jolivet     break;
1745c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1746c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1747c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1748c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1749c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1750c2916339SPierre Jolivet     idx = a->j;
1751c2916339SPierre Jolivet     v   = a->a;
1752c2916339SPierre Jolivet     mbs = a->mbs;
1753c2916339SPierre Jolivet     ii  = a->i;
1754c2916339SPierre Jolivet     z   = c;
1755c2916339SPierre Jolivet     for (i=0; i<mbs; i++) {
1756c2916339SPierre Jolivet       n = ii[1] - ii[0]; ii++;
1757c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1758*6718818eSStefano Zampini         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1759c2916339SPierre Jolivet         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1760c2916339SPierre Jolivet         v += bs2;
1761c2916339SPierre Jolivet       }
1762c2916339SPierre Jolivet       z += bs;
1763c2916339SPierre Jolivet     }
1764c2916339SPierre Jolivet   }
1765c2916339SPierre Jolivet   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1766c2916339SPierre Jolivet   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1767c2916339SPierre Jolivet   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1768c2916339SPierre Jolivet   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1769c2916339SPierre Jolivet   PetscFunctionReturn(0);
1770c2916339SPierre Jolivet }
1771