xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision fff8e43f20a4eb7b6688edfb8395c542b46130d2)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
813f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
949b5e25fSSatish Balay {
105eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
116849ba73SBarry Smith   PetscErrorCode ierr;
125d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT        table_out,table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
17e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18d94109b8SHong Zhang   mbs  = a->mbs;
19d94109b8SHong Zhang   ai   = a->i;
20d94109b8SHong Zhang   aj   = a->j;
21d0f46423SBarry Smith   bs   = A->rmap->bs;
2253b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
23854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
24854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2553b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
26d94109b8SHong Zhang 
27d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
28d94109b8SHong Zhang     isz  = 0;
29db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
30d94109b8SHong Zhang 
31d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
32d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
33d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
34d94109b8SHong Zhang 
35db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36dbe03f88SHong Zhang     bcol_max = 0;
37d94109b8SHong Zhang     for (j=0; j<n; ++j) {
38d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
39e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
41dbe03f88SHong Zhang         nidx[isz++] = brow;
42dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
43dbe03f88SHong Zhang       }
44d94109b8SHong Zhang     }
45d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
466bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47d94109b8SHong Zhang 
48d94109b8SHong Zhang     k = 0;
49d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
50db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
52db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
53d94109b8SHong Zhang 
54d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
55d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
56d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l<end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l<end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
79d94109b8SHong Zhang 
80d94109b8SHong Zhang     /* expand the Index Set */
81d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8226fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83d94109b8SHong Zhang     }
8470b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
85d94109b8SHong Zhang   }
8694bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
87d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
88d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
8994bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
905eee224dSHong Zhang   PetscFunctionReturn(0);
9149b5e25fSSatish Balay }
9249b5e25fSSatish Balay 
93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
95847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9649b5e25fSSatish Balay {
976849ba73SBarry Smith   PetscErrorCode ierr;
9849b5e25fSSatish Balay 
9949b5e25fSSatish Balay   PetscFunctionBegin;
100847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
101847daeccSHong Zhang   Bseq->ops->mult                   = 0;
102847daeccSHong Zhang   Bseq->ops->multadd                = 0;
103847daeccSHong Zhang   Bseq->ops->multtranspose          = 0;
104847daeccSHong Zhang   Bseq->ops->multtransposeadd       = 0;
105847daeccSHong Zhang   Bseq->ops->lufactor               = 0;
106847daeccSHong Zhang   Bseq->ops->choleskyfactor         = 0;
107847daeccSHong Zhang   Bseq->ops->lufactorsymbolic       = 0;
108847daeccSHong Zhang   Bseq->ops->choleskyfactorsymbolic = 0;
109847daeccSHong Zhang   Bseq->ops->getinertia             = 0;
11049b5e25fSSatish Balay   PetscFunctionReturn(0);
11149b5e25fSSatish Balay }
11249b5e25fSSatish Balay 
1137dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1147dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115e50a8114SHong Zhang {
116e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
117e50a8114SHong Zhang   PetscErrorCode ierr;
118e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
120e50a8114SHong Zhang   const PetscInt *irow,*icol;
121e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
123e50a8114SHong Zhang   MatScalar      *mat_a;
124e50a8114SHong Zhang   Mat            C;
125e50a8114SHong Zhang   PetscBool      flag;
126e50a8114SHong Zhang 
127e50a8114SHong Zhang   PetscFunctionBegin;
128e50a8114SHong Zhang 
129e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
130e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
131e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
132e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
133e50a8114SHong Zhang 
134e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
135e50a8114SHong Zhang   ssmap = smap;
136e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
137e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138e50a8114SHong Zhang   /* determine lens of each row */
139e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
140e50a8114SHong Zhang     kstart  = ai[irow[i]];
141e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
142e50a8114SHong Zhang     lens[i] = 0;
143e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
144e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
145e50a8114SHong Zhang     }
146e50a8114SHong Zhang   }
147e50a8114SHong Zhang   /* Create and fill new matrix */
148e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
149e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
150e50a8114SHong Zhang 
151e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152e50a8114SHong Zhang     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
153e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154e50a8114SHong Zhang     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
155e50a8114SHong Zhang     C    = *B;
156e50a8114SHong Zhang   } else {
157e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
158e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
159e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
160367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
161e50a8114SHong Zhang   }
162e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
163e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
164e50a8114SHong Zhang     row      = irow[i];
165e50a8114SHong Zhang     kstart   = ai[row];
166e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
167e50a8114SHong Zhang     mat_i    = c->i[i];
168e50a8114SHong Zhang     mat_j    = c->j + mat_i;
169e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
170e50a8114SHong Zhang     mat_ilen = c->ilen + i;
171e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
172e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
173e50a8114SHong Zhang         *mat_j++ = tcol - 1;
174e50a8114SHong Zhang         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
175e50a8114SHong Zhang         mat_a   += bs2;
176e50a8114SHong Zhang         (*mat_ilen)++;
177e50a8114SHong Zhang       }
178e50a8114SHong Zhang     }
179e50a8114SHong Zhang   }
180e50a8114SHong Zhang   /* sort */
181e50a8114SHong Zhang   {
182e50a8114SHong Zhang     MatScalar *work;
183e50a8114SHong Zhang 
184e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
185e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
186e50a8114SHong Zhang       PetscInt ilen;
187e50a8114SHong Zhang       mat_i = c->i[i];
188e50a8114SHong Zhang       mat_j = c->j + mat_i;
189e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
190e50a8114SHong Zhang       ilen  = c->ilen[i];
191e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
192e50a8114SHong Zhang     }
193e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
194e50a8114SHong Zhang   }
195e50a8114SHong Zhang 
196e50a8114SHong Zhang   /* Free work space */
197e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
199e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
200e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
201e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202e50a8114SHong Zhang 
203e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204e50a8114SHong Zhang   *B   = C;
205e50a8114SHong Zhang   PetscFunctionReturn(0);
206e50a8114SHong Zhang }
207e50a8114SHong Zhang 
2087dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
20949b5e25fSSatish Balay {
210e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
211e50a8114SHong Zhang   IS             is1,is2;
2126849ba73SBarry Smith   PetscErrorCode ierr;
213e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214e50a8114SHong Zhang   const PetscInt *irow,*icol;
21549b5e25fSSatish Balay 
21649b5e25fSSatish Balay   PetscFunctionBegin;
217e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
218e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
219e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
220e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
221e50a8114SHong Zhang 
222e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
223e50a8114SHong Zhang    and form the IS with compressed IS */
224e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
225e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
226e50a8114SHong Zhang   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
227e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
229e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230e50a8114SHong Zhang   }
231e50a8114SHong Zhang   count = 0;
232e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
233e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
234e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
235e50a8114SHong Zhang   }
236e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
237e50a8114SHong Zhang 
238e50a8114SHong Zhang   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
239e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
241e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242e50a8114SHong Zhang   }
243e50a8114SHong Zhang   count = 0;
244e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
245e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
246e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
247e50a8114SHong Zhang   }
248e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
249e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
250e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
251e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
252e50a8114SHong Zhang 
2537dae84e0SHong Zhang   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
254e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
255e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
256847daeccSHong Zhang 
2578f46ffcaSHong Zhang   if (isrow != iscol) {
2588f46ffcaSHong Zhang     PetscBool isequal;
2598f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
260847daeccSHong Zhang     if (!isequal) {
261847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2628f46ffcaSHong Zhang     }
26349b5e25fSSatish Balay   }
26449b5e25fSSatish Balay   PetscFunctionReturn(0);
26549b5e25fSSatish Balay }
26649b5e25fSSatish Balay 
2677dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26849b5e25fSSatish Balay {
2696849ba73SBarry Smith   PetscErrorCode ierr;
27013f74950SBarry Smith   PetscInt       i;
27149b5e25fSSatish Balay 
27249b5e25fSSatish Balay   PetscFunctionBegin;
273e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
274df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
275afebec48SHong Zhang   }
276e50a8114SHong Zhang 
277e50a8114SHong Zhang   for (i=0; i<n; i++) {
2787dae84e0SHong Zhang     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
27949b5e25fSSatish Balay   }
28049b5e25fSSatish Balay   PetscFunctionReturn(0);
28149b5e25fSSatish Balay }
28249b5e25fSSatish Balay 
28349b5e25fSSatish Balay /* -------------------------------------------------------*/
28449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28549b5e25fSSatish Balay /* -------------------------------------------------------*/
28649b5e25fSSatish Balay 
287dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28849b5e25fSSatish Balay {
28949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
290d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
291d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
292d9ca1df4SBarry Smith   const MatScalar   *v;
2936849ba73SBarry Smith   PetscErrorCode    ierr;
294d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
295d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29749b5e25fSSatish Balay 
29849b5e25fSSatish Balay   PetscFunctionBegin;
2992dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
300d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3011ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30249b5e25fSSatish Balay 
30349b5e25fSSatish Balay   v  = a->a;
30449b5e25fSSatish Balay   xb = x;
30549b5e25fSSatish Balay 
30649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
30849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
30949b5e25fSSatish Balay     ib          = aj + *ai;
310831a3094SHong Zhang     jmin        = 0;
31198c9bda7SSatish Balay     nonzerorow += (n>0);
3127fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31349b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
31449b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
315831a3094SHong Zhang       v        += 4; jmin++;
3167fbae186SHong Zhang     }
317444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
318444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
319831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32149b5e25fSSatish Balay       cval       = ib[j]*2;
32249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
32349b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
32449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32549b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32649b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32749b5e25fSSatish Balay       v        += 4;
32849b5e25fSSatish Balay     }
32949b5e25fSSatish Balay     xb +=2; ai++;
33049b5e25fSSatish Balay   }
33149b5e25fSSatish Balay 
332d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3331ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
334dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
33549b5e25fSSatish Balay   PetscFunctionReturn(0);
33649b5e25fSSatish Balay }
33749b5e25fSSatish Balay 
338dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
33949b5e25fSSatish Balay {
34049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
341d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
342d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
343d9ca1df4SBarry Smith   const MatScalar   *v;
3446849ba73SBarry Smith   PetscErrorCode    ierr;
345d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
346d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
34849b5e25fSSatish Balay 
34949b5e25fSSatish Balay   PetscFunctionBegin;
3502dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
351d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3521ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35349b5e25fSSatish Balay 
35449b5e25fSSatish Balay   v  = a->a;
35549b5e25fSSatish Balay   xb = x;
35649b5e25fSSatish Balay 
35749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
35849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
35949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
36049b5e25fSSatish Balay     ib          = aj + *ai;
361831a3094SHong Zhang     jmin        = 0;
36298c9bda7SSatish Balay     nonzerorow += (n>0);
3637fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
36449b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36549b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36649b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
367831a3094SHong Zhang       v        += 9; jmin++;
3687fbae186SHong Zhang     }
369444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
370444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
371831a3094SHong Zhang     for (j=jmin; j<n; j++) {
37249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37349b5e25fSSatish Balay       cval       = ib[j]*3;
37449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37549b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37649b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
37749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
37849b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
37949b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
38049b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
38149b5e25fSSatish Balay       v        += 9;
38249b5e25fSSatish Balay     }
38349b5e25fSSatish Balay     xb +=3; ai++;
38449b5e25fSSatish Balay   }
38549b5e25fSSatish Balay 
386d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3871ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
388dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
38949b5e25fSSatish Balay   PetscFunctionReturn(0);
39049b5e25fSSatish Balay }
39149b5e25fSSatish Balay 
392dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
39349b5e25fSSatish Balay {
39449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
395d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
396d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
397d9ca1df4SBarry Smith   const MatScalar   *v;
3986849ba73SBarry Smith   PetscErrorCode    ierr;
399d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
400d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
40198c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
40249b5e25fSSatish Balay 
40349b5e25fSSatish Balay   PetscFunctionBegin;
4042dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
405d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4061ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
40749b5e25fSSatish Balay 
40849b5e25fSSatish Balay   v  = a->a;
40949b5e25fSSatish Balay   xb = x;
41049b5e25fSSatish Balay 
41149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
41249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
41349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
41449b5e25fSSatish Balay     ib          = aj + *ai;
415831a3094SHong Zhang     jmin        = 0;
41698c9bda7SSatish Balay     nonzerorow += (n>0);
4177fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
41849b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
41949b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
42049b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
42149b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
422831a3094SHong Zhang       v        += 16; jmin++;
4237fbae186SHong Zhang     }
424444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
425444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
426831a3094SHong Zhang     for (j=jmin; j<n; j++) {
42749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
42849b5e25fSSatish Balay       cval       = ib[j]*4;
42949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
43049b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
43149b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
43249b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
43349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43449b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43549b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
43649b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
43749b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
43849b5e25fSSatish Balay       v        += 16;
43949b5e25fSSatish Balay     }
44049b5e25fSSatish Balay     xb +=4; ai++;
44149b5e25fSSatish Balay   }
44249b5e25fSSatish Balay 
443d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4441ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
445dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
44649b5e25fSSatish Balay   PetscFunctionReturn(0);
44749b5e25fSSatish Balay }
44849b5e25fSSatish Balay 
449dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
45049b5e25fSSatish Balay {
45149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
452d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
453d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
454d9ca1df4SBarry Smith   const MatScalar   *v;
4556849ba73SBarry Smith   PetscErrorCode    ierr;
456d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
457d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
45898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
45949b5e25fSSatish Balay 
46049b5e25fSSatish Balay   PetscFunctionBegin;
4612dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
462d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4631ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
46449b5e25fSSatish Balay 
46549b5e25fSSatish Balay   v  = a->a;
46649b5e25fSSatish Balay   xb = x;
46749b5e25fSSatish Balay 
46849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
46949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
47049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
47149b5e25fSSatish Balay     ib          = aj + *ai;
472831a3094SHong Zhang     jmin        = 0;
47398c9bda7SSatish Balay     nonzerorow += (n>0);
4747fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
47549b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
47649b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
47749b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
47849b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
47949b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
480831a3094SHong Zhang       v        += 25; jmin++;
4817fbae186SHong Zhang     }
482444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
483444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
484831a3094SHong Zhang     for (j=jmin; j<n; j++) {
48549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
48649b5e25fSSatish Balay       cval       = ib[j]*5;
48749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
48849b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
48949b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
49049b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
49149b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
49249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49349b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
49449b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
49549b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
49649b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
49749b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
49849b5e25fSSatish Balay       v        += 25;
49949b5e25fSSatish Balay     }
50049b5e25fSSatish Balay     xb +=5; ai++;
50149b5e25fSSatish Balay   }
50249b5e25fSSatish Balay 
503d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5041ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
505dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
50649b5e25fSSatish Balay   PetscFunctionReturn(0);
50749b5e25fSSatish Balay }
50849b5e25fSSatish Balay 
50949b5e25fSSatish Balay 
510dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
51149b5e25fSSatish Balay {
51249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
513d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
514d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
515d9ca1df4SBarry Smith   const MatScalar   *v;
5166849ba73SBarry Smith   PetscErrorCode    ierr;
517d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
518d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
51998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
52049b5e25fSSatish Balay 
52149b5e25fSSatish Balay   PetscFunctionBegin;
5222dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
523d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5241ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
52549b5e25fSSatish Balay 
52649b5e25fSSatish Balay   v  = a->a;
52749b5e25fSSatish Balay   xb = x;
52849b5e25fSSatish Balay 
52949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
53049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
53149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
53249b5e25fSSatish Balay     ib          = aj + *ai;
533831a3094SHong Zhang     jmin        = 0;
53498c9bda7SSatish Balay     nonzerorow += (n>0);
5357fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
53649b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
53749b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
53849b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
53949b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
54049b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
54149b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
542831a3094SHong Zhang       v        += 36; jmin++;
5437fbae186SHong Zhang     }
544444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
545444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
546831a3094SHong Zhang     for (j=jmin; j<n; j++) {
54749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
54849b5e25fSSatish Balay       cval       = ib[j]*6;
54949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
55049b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
55149b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
55249b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
55349b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
55449b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
55549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
55649b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
55749b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
55849b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
55949b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
56049b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
56149b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
56249b5e25fSSatish Balay       v        += 36;
56349b5e25fSSatish Balay     }
56449b5e25fSSatish Balay     xb +=6; ai++;
56549b5e25fSSatish Balay   }
56649b5e25fSSatish Balay 
567d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5681ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
57049b5e25fSSatish Balay   PetscFunctionReturn(0);
57149b5e25fSSatish Balay }
572dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
57349b5e25fSSatish Balay {
57449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
575d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
576d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
577d9ca1df4SBarry Smith   const MatScalar   *v;
5786849ba73SBarry Smith   PetscErrorCode    ierr;
579d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
580d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
58198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
58249b5e25fSSatish Balay 
58349b5e25fSSatish Balay   PetscFunctionBegin;
5842dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
585d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
58749b5e25fSSatish Balay 
58849b5e25fSSatish Balay   v  = a->a;
58949b5e25fSSatish Balay   xb = x;
59049b5e25fSSatish Balay 
59149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
59249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
59349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
59449b5e25fSSatish Balay     ib          = aj + *ai;
595831a3094SHong Zhang     jmin        = 0;
59698c9bda7SSatish Balay     nonzerorow += (n>0);
5977fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
59849b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
59949b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
60049b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
60149b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
60249b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
60349b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
60449b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
605831a3094SHong Zhang       v        += 49; jmin++;
6067fbae186SHong Zhang     }
607444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
608444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
609831a3094SHong Zhang     for (j=jmin; j<n; j++) {
61049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
61149b5e25fSSatish Balay       cval       = ib[j]*7;
61249b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
61349b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
61449b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
61549b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
61649b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
61749b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
61849b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
61949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
62049b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
62149b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
62249b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
62349b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
62449b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
62549b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
62649b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
62749b5e25fSSatish Balay       v       += 49;
62849b5e25fSSatish Balay     }
62949b5e25fSSatish Balay     xb +=7; ai++;
63049b5e25fSSatish Balay   }
631d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6321ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
633dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
63449b5e25fSSatish Balay   PetscFunctionReturn(0);
63549b5e25fSSatish Balay }
63649b5e25fSSatish Balay 
63749b5e25fSSatish Balay /*
63849b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
63949b5e25fSSatish Balay */
640dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
64149b5e25fSSatish Balay {
64249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
643d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
644d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
645d9ca1df4SBarry Smith   const MatScalar   *v;
646dfbe8321SBarry Smith   PetscErrorCode    ierr;
647d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
648d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
64998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
65049b5e25fSSatish Balay 
65149b5e25fSSatish Balay   PetscFunctionBegin;
6522dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
653d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6541ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
65549b5e25fSSatish Balay 
65649b5e25fSSatish Balay   aj = a->j;
65749b5e25fSSatish Balay   v  = a->a;
65849b5e25fSSatish Balay   ii = a->i;
65949b5e25fSSatish Balay 
66049b5e25fSSatish Balay   if (!a->mult_work) {
661854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
66249b5e25fSSatish Balay   }
66349b5e25fSSatish Balay   work = a->mult_work;
66449b5e25fSSatish Balay 
66549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
66649b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
66749b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
66898c9bda7SSatish Balay     nonzerorow += (n>0);
66949b5e25fSSatish Balay 
67049b5e25fSSatish Balay     /* upper triangular part */
67149b5e25fSSatish Balay     for (j=0; j<n; j++) {
67249b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
67349b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
67449b5e25fSSatish Balay       workt += bs;
67549b5e25fSSatish Balay     }
67649b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
67796b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
67849b5e25fSSatish Balay 
67949b5e25fSSatish Balay     /* strict lower triangular part */
680831a3094SHong Zhang     idx = aj+ii[0];
681831a3094SHong Zhang     if (*idx == i) {
68296b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
683831a3094SHong Zhang     }
68496b9376eSHong Zhang 
68549b5e25fSSatish Balay     if (ncols > 0) {
68649b5e25fSSatish Balay       workt = work;
68787828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
68896b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
689831a3094SHong Zhang       for (j=0; j<n; j++) {
690831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
69149b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
69249b5e25fSSatish Balay         workt += bs;
69349b5e25fSSatish Balay       }
69449b5e25fSSatish Balay     }
69549b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
69649b5e25fSSatish Balay   }
69749b5e25fSSatish Balay 
698d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6991ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
700dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
70149b5e25fSSatish Balay   PetscFunctionReturn(0);
70249b5e25fSSatish Balay }
70349b5e25fSSatish Balay 
704dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
70549b5e25fSSatish Balay {
70649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
707d9ca1df4SBarry Smith   PetscScalar       *z,x1;
708d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
709d9ca1df4SBarry Smith   const MatScalar   *v;
7106849ba73SBarry Smith   PetscErrorCode    ierr;
711d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
712d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
71398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
71449b5e25fSSatish Balay 
71549b5e25fSSatish Balay   PetscFunctionBegin;
716343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
717d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
71949b5e25fSSatish Balay   v    = a->a;
72049b5e25fSSatish Balay   xb   = x;
72149b5e25fSSatish Balay 
72249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
72349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
72449b5e25fSSatish Balay     x1          = xb[0];
72549b5e25fSSatish Balay     ib          = aj + *ai;
726831a3094SHong Zhang     jmin        = 0;
72798c9bda7SSatish Balay     nonzerorow += (n>0);
728831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
729831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
730831a3094SHong Zhang     }
731831a3094SHong Zhang     for (j=jmin; j<n; j++) {
73249b5e25fSSatish Balay       cval    = *ib;
73349b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
73449b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
73549b5e25fSSatish Balay     }
73649b5e25fSSatish Balay     xb++; ai++;
73749b5e25fSSatish Balay   }
73849b5e25fSSatish Balay 
739d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
740bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
74149b5e25fSSatish Balay 
742dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
74349b5e25fSSatish Balay   PetscFunctionReturn(0);
74449b5e25fSSatish Balay }
74549b5e25fSSatish Balay 
746dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
74749b5e25fSSatish Balay {
74849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
749d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
750d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
751d9ca1df4SBarry Smith   const MatScalar   *v;
7526849ba73SBarry Smith   PetscErrorCode    ierr;
753d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
754d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
75598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
75649b5e25fSSatish Balay 
75749b5e25fSSatish Balay   PetscFunctionBegin;
758343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
759d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7601ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
76149b5e25fSSatish Balay 
76249b5e25fSSatish Balay   v  = a->a;
76349b5e25fSSatish Balay   xb = x;
76449b5e25fSSatish Balay 
76549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
76649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
76749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
76849b5e25fSSatish Balay     ib          = aj + *ai;
769831a3094SHong Zhang     jmin        = 0;
77098c9bda7SSatish Balay     nonzerorow += (n>0);
7717fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
77249b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
77349b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
774831a3094SHong Zhang       v        += 4; jmin++;
7757fbae186SHong Zhang     }
776444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
777444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
778831a3094SHong Zhang     for (j=jmin; j<n; j++) {
77949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
78049b5e25fSSatish Balay       cval       = ib[j]*2;
78149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
78249b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
78349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
78449b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
78549b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
78649b5e25fSSatish Balay       v        += 4;
78749b5e25fSSatish Balay     }
78849b5e25fSSatish Balay     xb +=2; ai++;
78949b5e25fSSatish Balay   }
790d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
791bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
79249b5e25fSSatish Balay 
793dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
79449b5e25fSSatish Balay   PetscFunctionReturn(0);
79549b5e25fSSatish Balay }
79649b5e25fSSatish Balay 
797dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
79849b5e25fSSatish Balay {
79949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
800d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
801d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
802d9ca1df4SBarry Smith   const MatScalar   *v;
8036849ba73SBarry Smith   PetscErrorCode    ierr;
804d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
805d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
80698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
80749b5e25fSSatish Balay 
80849b5e25fSSatish Balay   PetscFunctionBegin;
809343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
810d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8111ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
81249b5e25fSSatish Balay 
81349b5e25fSSatish Balay   v  = a->a;
81449b5e25fSSatish Balay   xb = x;
81549b5e25fSSatish Balay 
81649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
81749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
81849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
81949b5e25fSSatish Balay     ib          = aj + *ai;
820831a3094SHong Zhang     jmin        = 0;
82198c9bda7SSatish Balay     nonzerorow += (n>0);
8227fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
82349b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
82449b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
82549b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
826831a3094SHong Zhang       v        += 9; jmin++;
8277fbae186SHong Zhang     }
828444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
829444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
830831a3094SHong Zhang     for (j=jmin; j<n; j++) {
83149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
83249b5e25fSSatish Balay       cval       = ib[j]*3;
83349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
83449b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
83549b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
83649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
83749b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
83849b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
83949b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
84049b5e25fSSatish Balay       v        += 9;
84149b5e25fSSatish Balay     }
84249b5e25fSSatish Balay     xb +=3; ai++;
84349b5e25fSSatish Balay   }
84449b5e25fSSatish Balay 
845d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
846bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
84749b5e25fSSatish Balay 
848dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
84949b5e25fSSatish Balay   PetscFunctionReturn(0);
85049b5e25fSSatish Balay }
85149b5e25fSSatish Balay 
852dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
85349b5e25fSSatish Balay {
85449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
855d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
856d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
857d9ca1df4SBarry Smith   const MatScalar   *v;
8586849ba73SBarry Smith   PetscErrorCode    ierr;
859d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
860d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
86198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
86249b5e25fSSatish Balay 
86349b5e25fSSatish Balay   PetscFunctionBegin;
864343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
865d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8661ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
86749b5e25fSSatish Balay 
86849b5e25fSSatish Balay   v  = a->a;
86949b5e25fSSatish Balay   xb = x;
87049b5e25fSSatish Balay 
87149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
87249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
87349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
87449b5e25fSSatish Balay     ib          = aj + *ai;
875831a3094SHong Zhang     jmin        = 0;
87698c9bda7SSatish Balay     nonzerorow += (n>0);
8777fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
87849b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
87949b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
88049b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
88149b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
882831a3094SHong Zhang       v        += 16; jmin++;
8837fbae186SHong Zhang     }
884444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
885444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886831a3094SHong Zhang     for (j=jmin; j<n; j++) {
88749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
88849b5e25fSSatish Balay       cval       = ib[j]*4;
88949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
89049b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
89149b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
89249b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
89349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
89449b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
89549b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
89649b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
89749b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
89849b5e25fSSatish Balay       v        += 16;
89949b5e25fSSatish Balay     }
90049b5e25fSSatish Balay     xb +=4; ai++;
90149b5e25fSSatish Balay   }
90249b5e25fSSatish Balay 
903d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
904bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
90549b5e25fSSatish Balay 
906dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
90749b5e25fSSatish Balay   PetscFunctionReturn(0);
90849b5e25fSSatish Balay }
90949b5e25fSSatish Balay 
910dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
91149b5e25fSSatish Balay {
91249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
913d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
914d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
915d9ca1df4SBarry Smith   const MatScalar   *v;
9166849ba73SBarry Smith   PetscErrorCode    ierr;
917d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
918d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
91998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
92049b5e25fSSatish Balay 
92149b5e25fSSatish Balay   PetscFunctionBegin;
922343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
923d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9241ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
92549b5e25fSSatish Balay 
92649b5e25fSSatish Balay   v  = a->a;
92749b5e25fSSatish Balay   xb = x;
92849b5e25fSSatish Balay 
92949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
93049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
93149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
93249b5e25fSSatish Balay     ib          = aj + *ai;
933831a3094SHong Zhang     jmin        = 0;
93498c9bda7SSatish Balay     nonzerorow += (n>0);
9357fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
93649b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
93749b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
93849b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
93949b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
94049b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
941831a3094SHong Zhang       v        += 25; jmin++;
9427fbae186SHong Zhang     }
943444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
944444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
945831a3094SHong Zhang     for (j=jmin; j<n; j++) {
94649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
94749b5e25fSSatish Balay       cval       = ib[j]*5;
94849b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
94949b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
95049b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
95149b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
95249b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
95349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
95449b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
95549b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
95649b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
95749b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
95849b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
95949b5e25fSSatish Balay       v        += 25;
96049b5e25fSSatish Balay     }
96149b5e25fSSatish Balay     xb +=5; ai++;
96249b5e25fSSatish Balay   }
96349b5e25fSSatish Balay 
964d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
965bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
96649b5e25fSSatish Balay 
967dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
96849b5e25fSSatish Balay   PetscFunctionReturn(0);
96949b5e25fSSatish Balay }
970dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
97149b5e25fSSatish Balay {
97249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
973d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
974d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
975d9ca1df4SBarry Smith   const MatScalar   *v;
9766849ba73SBarry Smith   PetscErrorCode    ierr;
977d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
978d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
97998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
98049b5e25fSSatish Balay 
98149b5e25fSSatish Balay   PetscFunctionBegin;
982343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
983d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9841ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
98549b5e25fSSatish Balay 
98649b5e25fSSatish Balay   v  = a->a;
98749b5e25fSSatish Balay   xb = x;
98849b5e25fSSatish Balay 
98949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
99049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
99149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
99249b5e25fSSatish Balay     ib          = aj + *ai;
993831a3094SHong Zhang     jmin        = 0;
99498c9bda7SSatish Balay     nonzerorow += (n>0);
9957fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
99649b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
99749b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
99849b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
99949b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
100049b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
100149b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1002831a3094SHong Zhang       v        += 36; jmin++;
10037fbae186SHong Zhang     }
1004444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1005444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1006831a3094SHong Zhang     for (j=jmin; j<n; j++) {
100749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
100849b5e25fSSatish Balay       cval       = ib[j]*6;
100949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
101049b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
101149b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
101249b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
101349b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
101449b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
101549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
101649b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
101749b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
101849b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
101949b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
102049b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
102149b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
102249b5e25fSSatish Balay       v        += 36;
102349b5e25fSSatish Balay     }
102449b5e25fSSatish Balay     xb +=6; ai++;
102549b5e25fSSatish Balay   }
102649b5e25fSSatish Balay 
1027d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1028bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
102949b5e25fSSatish Balay 
1030dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
103149b5e25fSSatish Balay   PetscFunctionReturn(0);
103249b5e25fSSatish Balay }
103349b5e25fSSatish Balay 
1034dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
103549b5e25fSSatish Balay {
103649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1037d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1038d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1039d9ca1df4SBarry Smith   const MatScalar   *v;
10406849ba73SBarry Smith   PetscErrorCode    ierr;
1041d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1042d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
104398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
104449b5e25fSSatish Balay 
104549b5e25fSSatish Balay   PetscFunctionBegin;
1046343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1047d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10481ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
104949b5e25fSSatish Balay 
105049b5e25fSSatish Balay   v  = a->a;
105149b5e25fSSatish Balay   xb = x;
105249b5e25fSSatish Balay 
105349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
105449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
105549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
105649b5e25fSSatish Balay     ib          = aj + *ai;
1057831a3094SHong Zhang     jmin        = 0;
105898c9bda7SSatish Balay     nonzerorow += (n>0);
10597fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
106049b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
106149b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
106249b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
106349b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
106449b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
106549b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
106649b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1067831a3094SHong Zhang       v        += 49; jmin++;
10687fbae186SHong Zhang     }
1069444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1070444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1071831a3094SHong Zhang     for (j=jmin; j<n; j++) {
107249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
107349b5e25fSSatish Balay       cval       = ib[j]*7;
107449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
107549b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
107649b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
107749b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
107849b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
107949b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
108049b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
108149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
108249b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
108349b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
108449b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
108549b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
108649b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
108749b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
108849b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
108949b5e25fSSatish Balay       v       += 49;
109049b5e25fSSatish Balay     }
109149b5e25fSSatish Balay     xb +=7; ai++;
109249b5e25fSSatish Balay   }
109349b5e25fSSatish Balay 
1094d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1095bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
109649b5e25fSSatish Balay 
1097dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
109849b5e25fSSatish Balay   PetscFunctionReturn(0);
109949b5e25fSSatish Balay }
110049b5e25fSSatish Balay 
1101dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
110249b5e25fSSatish Balay {
110349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1104d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1105d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1106d9ca1df4SBarry Smith   const MatScalar   *v;
1107dfbe8321SBarry Smith   PetscErrorCode    ierr;
1108d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1109d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
111098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
111149b5e25fSSatish Balay 
111249b5e25fSSatish Balay   PetscFunctionBegin;
1113343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1114d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11151ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
111649b5e25fSSatish Balay 
111749b5e25fSSatish Balay   aj = a->j;
111849b5e25fSSatish Balay   v  = a->a;
111949b5e25fSSatish Balay   ii = a->i;
112049b5e25fSSatish Balay 
112149b5e25fSSatish Balay   if (!a->mult_work) {
1122854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
112349b5e25fSSatish Balay   }
112449b5e25fSSatish Balay   work = a->mult_work;
112549b5e25fSSatish Balay 
112649b5e25fSSatish Balay 
112749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
112849b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
112949b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
113098c9bda7SSatish Balay     nonzerorow += (n>0);
113149b5e25fSSatish Balay 
113249b5e25fSSatish Balay     /* upper triangular part */
113349b5e25fSSatish Balay     for (j=0; j<n; j++) {
113449b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
113549b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
113649b5e25fSSatish Balay       workt += bs;
113749b5e25fSSatish Balay     }
113849b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
113996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
114049b5e25fSSatish Balay 
114149b5e25fSSatish Balay     /* strict lower triangular part */
1142831a3094SHong Zhang     idx = aj+ii[0];
1143831a3094SHong Zhang     if (*idx == i) {
114496b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1145831a3094SHong Zhang     }
114649b5e25fSSatish Balay     if (ncols > 0) {
114749b5e25fSSatish Balay       workt = work;
114887828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
114996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1150831a3094SHong Zhang       for (j=0; j<n; j++) {
1151831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
115249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
115349b5e25fSSatish Balay         workt += bs;
115449b5e25fSSatish Balay       }
115549b5e25fSSatish Balay     }
115649b5e25fSSatish Balay 
115749b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
115849b5e25fSSatish Balay   }
115949b5e25fSSatish Balay 
1160d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1161bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
116249b5e25fSSatish Balay 
1163dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
116449b5e25fSSatish Balay   PetscFunctionReturn(0);
116549b5e25fSSatish Balay }
116649b5e25fSSatish Balay 
1167f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
116849b5e25fSSatish Balay {
116949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1170f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1171efee365bSSatish Balay   PetscErrorCode ierr;
1172c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
117349b5e25fSSatish Balay 
117449b5e25fSSatish Balay   PetscFunctionBegin;
1175c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11768b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1177efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
117849b5e25fSSatish Balay   PetscFunctionReturn(0);
117949b5e25fSSatish Balay }
118049b5e25fSSatish Balay 
1181dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
118249b5e25fSSatish Balay {
118349b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1184d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
118549b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1186d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11876849ba73SBarry Smith   PetscErrorCode  ierr;
1188d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
118949b5e25fSSatish Balay 
119049b5e25fSSatish Balay   PetscFunctionBegin;
119149b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
119249b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
119349b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1194831a3094SHong Zhang       col  = aj + jmin;
1195831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
119649b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
119749b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
119849b5e25fSSatish Balay         }
1199831a3094SHong Zhang         jmin++;
1200831a3094SHong Zhang       }
1201831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
120249b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
120349b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
120449b5e25fSSatish Balay         }
120549b5e25fSSatish Balay       }
120649b5e25fSSatish Balay     }
12078f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
120851f70360SJed Brown     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
12090b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1210dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12110b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12120b8dc8d2SHong Zhang     il[0] = 0;
121349b5e25fSSatish Balay 
121449b5e25fSSatish Balay     *norm = 0.0;
121549b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
121649b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
121749b5e25fSSatish Balay       /*-- col sum --*/
121849b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1219831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1220831a3094SHong Zhang                   at step k */
122149b5e25fSSatish Balay       while (i<mbs) {
122249b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
122349b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
122449b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122549b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
122649b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
122749b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
122849b5e25fSSatish Balay           }
122949b5e25fSSatish Balay         }
123049b5e25fSSatish Balay         /* update il, jl */
1231831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1232831a3094SHong Zhang         jmax = a->i[i+1];
123349b5e25fSSatish Balay         if (jmin < jmax) {
123449b5e25fSSatish Balay           il[i] = jmin;
123549b5e25fSSatish Balay           j     = a->j[jmin];
123649b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
123749b5e25fSSatish Balay         }
123849b5e25fSSatish Balay         i = nexti;
123949b5e25fSSatish Balay       }
124049b5e25fSSatish Balay       /*-- row sum --*/
124149b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
124249b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
124349b5e25fSSatish Balay         for (j=0; j<bs; j++) {
124449b5e25fSSatish Balay           v = a->a + i*bs2 + j;
124549b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12460b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
124749b5e25fSSatish Balay           }
124849b5e25fSSatish Balay         }
124949b5e25fSSatish Balay       }
125049b5e25fSSatish Balay       /* add k_th block row to il, jl */
1251831a3094SHong Zhang       col = aj+jmin;
1252831a3094SHong Zhang       if (*col == k) jmin++;
125349b5e25fSSatish Balay       if (jmin < jmax) {
125449b5e25fSSatish Balay         il[k] = jmin;
12550b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
125649b5e25fSSatish Balay       }
125749b5e25fSSatish Balay       for (j=0; j<bs; j++) {
125849b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
125949b5e25fSSatish Balay       }
126049b5e25fSSatish Balay     }
126174ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
126251f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1263f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
126449b5e25fSSatish Balay   PetscFunctionReturn(0);
126549b5e25fSSatish Balay }
126649b5e25fSSatish Balay 
1267ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
126849b5e25fSSatish Balay {
126949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1270dfbe8321SBarry Smith   PetscErrorCode ierr;
127149b5e25fSSatish Balay 
127249b5e25fSSatish Balay   PetscFunctionBegin;
127349b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1274d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1275ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1276ef511fbeSHong Zhang     PetscFunctionReturn(0);
127749b5e25fSSatish Balay   }
127849b5e25fSSatish Balay 
127949b5e25fSSatish Balay   /* if the a->i are the same */
128013f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
128126fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
128249b5e25fSSatish Balay 
128349b5e25fSSatish Balay   /* if a->j are the same */
128413f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
128526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
128626fbe8dcSKarl Rupp 
128749b5e25fSSatish Balay   /* if a->a are the same */
1288d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1289935af2e7SHong Zhang   PetscFunctionReturn(0);
129049b5e25fSSatish Balay }
129149b5e25fSSatish Balay 
1292dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
129349b5e25fSSatish Balay {
129449b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1295dfbe8321SBarry Smith   PetscErrorCode  ierr;
1296d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1297d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
129887828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1299d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
130049b5e25fSSatish Balay 
130149b5e25fSSatish Balay   PetscFunctionBegin;
1302d0f46423SBarry Smith   bs = A->rmap->bs;
1303e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
130482799104SHong Zhang 
130549b5e25fSSatish Balay   aa   = a->a;
13068a0c6efdSHong Zhang   ambs = a->mbs;
13078a0c6efdSHong Zhang 
13088a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13098a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13108a0c6efdSHong Zhang     aa   = a->a;
13118a0c6efdSHong Zhang     ambs = a->mbs;
13128a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13138a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13148a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13158a0c6efdSHong Zhang     PetscFunctionReturn(0);
13168a0c6efdSHong Zhang   }
13178a0c6efdSHong Zhang 
131849b5e25fSSatish Balay   ai   = a->i;
131949b5e25fSSatish Balay   aj   = a->j;
132049b5e25fSSatish Balay   bs2  = a->bs2;
13212dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13221ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
132349b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
132449b5e25fSSatish Balay     j=ai[i];
132549b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
132649b5e25fSSatish Balay       row  = i*bs;
132749b5e25fSSatish Balay       aa_j = aa + j*bs2;
132849b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
132949b5e25fSSatish Balay     }
133049b5e25fSSatish Balay   }
13311ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
133249b5e25fSSatish Balay   PetscFunctionReturn(0);
133349b5e25fSSatish Balay }
133449b5e25fSSatish Balay 
1335dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
133649b5e25fSSatish Balay {
133749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1338d9ca1df4SBarry Smith   PetscScalar       x;
1339d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
134049b5e25fSSatish Balay   MatScalar         *aa,*v;
1341dfbe8321SBarry Smith   PetscErrorCode    ierr;
1342*fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1343*fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1344ace3abfcSBarry Smith   PetscBool         flg;
134549b5e25fSSatish Balay 
134649b5e25fSSatish Balay   PetscFunctionBegin;
1347b3bf805bSHong Zhang   if (ll != rr) {
1348b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1349e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1350b3bf805bSHong Zhang   }
1351b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
135249b5e25fSSatish Balay   ai  = a->i;
135349b5e25fSSatish Balay   aj  = a->j;
135449b5e25fSSatish Balay   aa  = a->a;
1355d0f46423SBarry Smith   m   = A->rmap->N;
1356d0f46423SBarry Smith   bs  = A->rmap->bs;
135749b5e25fSSatish Balay   mbs = a->mbs;
135849b5e25fSSatish Balay   bs2 = a->bs2;
135949b5e25fSSatish Balay 
1360d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
136149b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1362e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
136349b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
136449b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
136549b5e25fSSatish Balay     li = l + i*bs;
136649b5e25fSSatish Balay     v  = aa + bs2*ai[i];
136749b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
136849b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13695e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
137049b5e25fSSatish Balay         x = ri[k];
137149b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
137249b5e25fSSatish Balay       }
137349b5e25fSSatish Balay     }
137449b5e25fSSatish Balay   }
1375d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1376dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
137749b5e25fSSatish Balay   PetscFunctionReturn(0);
137849b5e25fSSatish Balay }
137949b5e25fSSatish Balay 
1380dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
138149b5e25fSSatish Balay {
138249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
138349b5e25fSSatish Balay 
138449b5e25fSSatish Balay   PetscFunctionBegin;
138549b5e25fSSatish Balay   info->block_size   = a->bs2;
1386ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13876c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
138849b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
138949b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13908e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13917adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1392d5f3da31SBarry Smith   if (A->factortype) {
139349b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
139449b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
139549b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
139649b5e25fSSatish Balay   } else {
139749b5e25fSSatish Balay     info->fill_ratio_given  = 0;
139849b5e25fSSatish Balay     info->fill_ratio_needed = 0;
139949b5e25fSSatish Balay     info->factor_mallocs    = 0;
140049b5e25fSSatish Balay   }
140149b5e25fSSatish Balay   PetscFunctionReturn(0);
140249b5e25fSSatish Balay }
140349b5e25fSSatish Balay 
140449b5e25fSSatish Balay 
1405dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
140649b5e25fSSatish Balay {
140749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1408dfbe8321SBarry Smith   PetscErrorCode ierr;
140949b5e25fSSatish Balay 
141049b5e25fSSatish Balay   PetscFunctionBegin;
141149b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
141249b5e25fSSatish Balay   PetscFunctionReturn(0);
141349b5e25fSSatish Balay }
1414dc354874SHong Zhang 
1415985db425SBarry Smith /*
1416985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1417985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1418985db425SBarry Smith */
1419985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1420dc354874SHong Zhang {
1421dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1422dfbe8321SBarry Smith   PetscErrorCode  ierr;
1423d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1424d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1425c3fca9a7SHong Zhang   PetscReal       atmp;
1426d9ca1df4SBarry Smith   const MatScalar *aa;
1427985db425SBarry Smith   PetscScalar     *x;
142813f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1429dc354874SHong Zhang 
1430dc354874SHong Zhang   PetscFunctionBegin;
1431e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1432e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1433d0f46423SBarry Smith   bs  = A->rmap->bs;
1434dc354874SHong Zhang   aa  = a->a;
1435dc354874SHong Zhang   ai  = a->i;
1436dc354874SHong Zhang   aj  = a->j;
143744117c81SHong Zhang   mbs = a->mbs;
1438dc354874SHong Zhang 
1439985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14401ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1441dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1442e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
144344117c81SHong Zhang   for (i=0; i<mbs; i++) {
1444d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1445d0f6400bSHong Zhang     brow  = bs*i;
144644117c81SHong Zhang     for (j=0; j<ncols; j++) {
1447d0f6400bSHong Zhang       bcol = bs*(*aj);
144844117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1449d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
145044117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1451d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1452d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1453c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1454c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
145544117c81SHong Zhang         }
145644117c81SHong Zhang       }
1457d0f6400bSHong Zhang       aj++;
1458dc354874SHong Zhang     }
1459dc354874SHong Zhang   }
14601ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1461dc354874SHong Zhang   PetscFunctionReturn(0);
1462dc354874SHong Zhang }
1463