xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision c40ae8730e1156c97d0dc1f9569c0c972cb9ef2b)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
813f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
949b5e25fSSatish Balay {
105eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
116849ba73SBarry Smith   PetscErrorCode ierr;
125d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
135d0c19d7SBarry Smith   const PetscInt *idx;
14db41cccfSHong Zhang   PetscBT        table_out,table_in;
15d94109b8SHong Zhang 
16d94109b8SHong Zhang   PetscFunctionBegin;
17e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18d94109b8SHong Zhang   mbs  = a->mbs;
19d94109b8SHong Zhang   ai   = a->i;
20d94109b8SHong Zhang   aj   = a->j;
21d0f46423SBarry Smith   bs   = A->rmap->bs;
2253b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
23854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
24854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2553b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
26d94109b8SHong Zhang 
27d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
28d94109b8SHong Zhang     isz  = 0;
29db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
30d94109b8SHong Zhang 
31d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
32d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
33d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
34d94109b8SHong Zhang 
35db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36dbe03f88SHong Zhang     bcol_max = 0;
37d94109b8SHong Zhang     for (j=0; j<n; ++j) {
38d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
39e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
41dbe03f88SHong Zhang         nidx[isz++] = brow;
42dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
43dbe03f88SHong Zhang       }
44d94109b8SHong Zhang     }
45d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
466bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47d94109b8SHong Zhang 
48d94109b8SHong Zhang     k = 0;
49d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
50db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
52db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
53d94109b8SHong Zhang 
54d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
55d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
56d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
57db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58d94109b8SHong Zhang           for (l = start; l<end; l++) {
59d94109b8SHong Zhang             bcol = aj[l];
60d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
61d7b97159SHong Zhang               nidx[isz++] = bcol;
62d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
63d7b97159SHong Zhang             }
64d94109b8SHong Zhang           }
65d94109b8SHong Zhang           k++;
66d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68d94109b8SHong Zhang           for (l = start; l<end; l++) {
69d94109b8SHong Zhang             bcol = aj[l];
70dbe03f88SHong Zhang             if (bcol > bcol_max) break;
71db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7226fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
74d94109b8SHong Zhang             }
75d94109b8SHong Zhang           }
76d94109b8SHong Zhang         }
77d94109b8SHong Zhang       }
78d94109b8SHong Zhang     } /* for each overlap */
79d94109b8SHong Zhang 
80d94109b8SHong Zhang     /* expand the Index Set */
81d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8226fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83d94109b8SHong Zhang     }
8470b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
85d94109b8SHong Zhang   }
8694bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
87d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
88d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
8994bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
905eee224dSHong Zhang   PetscFunctionReturn(0);
9149b5e25fSSatish Balay }
9249b5e25fSSatish Balay 
93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
95847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9649b5e25fSSatish Balay {
976849ba73SBarry Smith   PetscErrorCode ierr;
9849b5e25fSSatish Balay 
9949b5e25fSSatish Balay   PetscFunctionBegin;
100847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
101847daeccSHong Zhang   Bseq->ops->mult                   = 0;
102847daeccSHong Zhang   Bseq->ops->multadd                = 0;
103847daeccSHong Zhang   Bseq->ops->multtranspose          = 0;
104847daeccSHong Zhang   Bseq->ops->multtransposeadd       = 0;
105847daeccSHong Zhang   Bseq->ops->lufactor               = 0;
106847daeccSHong Zhang   Bseq->ops->choleskyfactor         = 0;
107847daeccSHong Zhang   Bseq->ops->lufactorsymbolic       = 0;
108847daeccSHong Zhang   Bseq->ops->choleskyfactorsymbolic = 0;
109847daeccSHong Zhang   Bseq->ops->getinertia             = 0;
11049b5e25fSSatish Balay   PetscFunctionReturn(0);
11149b5e25fSSatish Balay }
11249b5e25fSSatish Balay 
1137dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1147dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115e50a8114SHong Zhang {
116e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
117e50a8114SHong Zhang   PetscErrorCode ierr;
118e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
120e50a8114SHong Zhang   const PetscInt *irow,*icol;
121e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
123e50a8114SHong Zhang   MatScalar      *mat_a;
124e50a8114SHong Zhang   Mat            C;
125e50a8114SHong Zhang   PetscBool      flag;
126e50a8114SHong Zhang 
127e50a8114SHong Zhang   PetscFunctionBegin;
128e50a8114SHong Zhang 
129e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
130e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
131e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
132e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
133e50a8114SHong Zhang 
134e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
135e50a8114SHong Zhang   ssmap = smap;
136e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
137e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138e50a8114SHong Zhang   /* determine lens of each row */
139e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
140e50a8114SHong Zhang     kstart  = ai[irow[i]];
141e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
142e50a8114SHong Zhang     lens[i] = 0;
143e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
144e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
145e50a8114SHong Zhang     }
146e50a8114SHong Zhang   }
147e50a8114SHong Zhang   /* Create and fill new matrix */
148e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
149e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
150e50a8114SHong Zhang 
151e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152580bdb30SBarry Smith     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
153e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154580bdb30SBarry Smith     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
155e50a8114SHong Zhang     C    = *B;
156e50a8114SHong Zhang   } else {
157e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
158e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
159e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
160367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
161e50a8114SHong Zhang   }
162e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
163e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
164e50a8114SHong Zhang     row      = irow[i];
165e50a8114SHong Zhang     kstart   = ai[row];
166e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
167e50a8114SHong Zhang     mat_i    = c->i[i];
168e50a8114SHong Zhang     mat_j    = c->j + mat_i;
169e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
170e50a8114SHong Zhang     mat_ilen = c->ilen + i;
171e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
172e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
173e50a8114SHong Zhang         *mat_j++ = tcol - 1;
174580bdb30SBarry Smith         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
175e50a8114SHong Zhang         mat_a   += bs2;
176e50a8114SHong Zhang         (*mat_ilen)++;
177e50a8114SHong Zhang       }
178e50a8114SHong Zhang     }
179e50a8114SHong Zhang   }
180e50a8114SHong Zhang   /* sort */
181e50a8114SHong Zhang   {
182e50a8114SHong Zhang     MatScalar *work;
183e50a8114SHong Zhang 
184e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
185e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
186e50a8114SHong Zhang       PetscInt ilen;
187e50a8114SHong Zhang       mat_i = c->i[i];
188e50a8114SHong Zhang       mat_j = c->j + mat_i;
189e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
190e50a8114SHong Zhang       ilen  = c->ilen[i];
191e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
192e50a8114SHong Zhang     }
193e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
194e50a8114SHong Zhang   }
195e50a8114SHong Zhang 
196e50a8114SHong Zhang   /* Free work space */
197e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
199e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
200e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
201e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202e50a8114SHong Zhang 
203e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204e50a8114SHong Zhang   *B   = C;
205e50a8114SHong Zhang   PetscFunctionReturn(0);
206e50a8114SHong Zhang }
207e50a8114SHong Zhang 
2087dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
20949b5e25fSSatish Balay {
210e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
211e50a8114SHong Zhang   IS             is1,is2;
2126849ba73SBarry Smith   PetscErrorCode ierr;
213e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214e50a8114SHong Zhang   const PetscInt *irow,*icol;
21549b5e25fSSatish Balay 
21649b5e25fSSatish Balay   PetscFunctionBegin;
217e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
218e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
219e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
220e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
221e50a8114SHong Zhang 
222e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
223e50a8114SHong Zhang    and form the IS with compressed IS */
224e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
225e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
226580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
227e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
229e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230e50a8114SHong Zhang   }
231e50a8114SHong Zhang   count = 0;
232e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
233e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
234e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
235e50a8114SHong Zhang   }
236e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
237e50a8114SHong Zhang 
238580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
239e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
241e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242e50a8114SHong Zhang   }
243e50a8114SHong Zhang   count = 0;
244e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
245e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
246e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
247e50a8114SHong Zhang   }
248e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
249e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
250e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
251e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
252e50a8114SHong Zhang 
2537dae84e0SHong Zhang   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
254e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
255e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
256847daeccSHong Zhang 
2578f46ffcaSHong Zhang   if (isrow != iscol) {
2588f46ffcaSHong Zhang     PetscBool isequal;
2598f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
260847daeccSHong Zhang     if (!isequal) {
261847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2628f46ffcaSHong Zhang     }
26349b5e25fSSatish Balay   }
26449b5e25fSSatish Balay   PetscFunctionReturn(0);
26549b5e25fSSatish Balay }
26649b5e25fSSatish Balay 
2677dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26849b5e25fSSatish Balay {
2696849ba73SBarry Smith   PetscErrorCode ierr;
27013f74950SBarry Smith   PetscInt       i;
27149b5e25fSSatish Balay 
27249b5e25fSSatish Balay   PetscFunctionBegin;
273e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
274df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
275afebec48SHong Zhang   }
276e50a8114SHong Zhang 
277e50a8114SHong Zhang   for (i=0; i<n; i++) {
2787dae84e0SHong Zhang     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
27949b5e25fSSatish Balay   }
28049b5e25fSSatish Balay   PetscFunctionReturn(0);
28149b5e25fSSatish Balay }
28249b5e25fSSatish Balay 
28349b5e25fSSatish Balay /* -------------------------------------------------------*/
28449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28549b5e25fSSatish Balay /* -------------------------------------------------------*/
28649b5e25fSSatish Balay 
287dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28849b5e25fSSatish Balay {
28949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
290d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
291d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
292d9ca1df4SBarry Smith   const MatScalar   *v;
2936849ba73SBarry Smith   PetscErrorCode    ierr;
294d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
295d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29749b5e25fSSatish Balay 
29849b5e25fSSatish Balay   PetscFunctionBegin;
2992dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
300*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
301d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3021ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30349b5e25fSSatish Balay 
30449b5e25fSSatish Balay   v  = a->a;
30549b5e25fSSatish Balay   xb = x;
30649b5e25fSSatish Balay 
30749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
30949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
31049b5e25fSSatish Balay     ib          = aj + *ai;
311831a3094SHong Zhang     jmin        = 0;
31298c9bda7SSatish Balay     nonzerorow += (n>0);
3137fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31449b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
31549b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
316831a3094SHong Zhang       v        += 4; jmin++;
3177fbae186SHong Zhang     }
318444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
319444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
320831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32249b5e25fSSatish Balay       cval       = ib[j]*2;
32349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
32449b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
32549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32649b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32749b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32849b5e25fSSatish Balay       v        += 4;
32949b5e25fSSatish Balay     }
33049b5e25fSSatish Balay     xb +=2; ai++;
33149b5e25fSSatish Balay   }
33249b5e25fSSatish Balay 
333d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3341ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
335dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
33649b5e25fSSatish Balay   PetscFunctionReturn(0);
33749b5e25fSSatish Balay }
33849b5e25fSSatish Balay 
339dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
34049b5e25fSSatish Balay {
34149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
342d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
343d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
344d9ca1df4SBarry Smith   const MatScalar   *v;
3456849ba73SBarry Smith   PetscErrorCode    ierr;
346d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
347d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
34949b5e25fSSatish Balay 
35049b5e25fSSatish Balay   PetscFunctionBegin;
3512dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
352*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
353d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3541ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35549b5e25fSSatish Balay 
35649b5e25fSSatish Balay   v  = a->a;
35749b5e25fSSatish Balay   xb = x;
35849b5e25fSSatish Balay 
35949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
36249b5e25fSSatish Balay     ib          = aj + *ai;
363831a3094SHong Zhang     jmin        = 0;
36498c9bda7SSatish Balay     nonzerorow += (n>0);
3657fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
36649b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36749b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36849b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
369831a3094SHong Zhang       v        += 9; jmin++;
3707fbae186SHong Zhang     }
371444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
373831a3094SHong Zhang     for (j=jmin; j<n; j++) {
37449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37549b5e25fSSatish Balay       cval       = ib[j]*3;
37649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37749b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37849b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
37949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38049b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
38149b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
38249b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
38349b5e25fSSatish Balay       v        += 9;
38449b5e25fSSatish Balay     }
38549b5e25fSSatish Balay     xb +=3; ai++;
38649b5e25fSSatish Balay   }
38749b5e25fSSatish Balay 
388d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3891ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
390dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
39149b5e25fSSatish Balay   PetscFunctionReturn(0);
39249b5e25fSSatish Balay }
39349b5e25fSSatish Balay 
394dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
39549b5e25fSSatish Balay {
39649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
397d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
398d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
399d9ca1df4SBarry Smith   const MatScalar   *v;
4006849ba73SBarry Smith   PetscErrorCode    ierr;
401d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
402d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
40398c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
40449b5e25fSSatish Balay 
40549b5e25fSSatish Balay   PetscFunctionBegin;
4062dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
407*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
408d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4091ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
41049b5e25fSSatish Balay 
41149b5e25fSSatish Balay   v  = a->a;
41249b5e25fSSatish Balay   xb = x;
41349b5e25fSSatish Balay 
41449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
41549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
41649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
41749b5e25fSSatish Balay     ib          = aj + *ai;
418831a3094SHong Zhang     jmin        = 0;
41998c9bda7SSatish Balay     nonzerorow += (n>0);
4207fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
42149b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
42249b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
42349b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
42449b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
425831a3094SHong Zhang       v        += 16; jmin++;
4267fbae186SHong Zhang     }
427444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
428444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429831a3094SHong Zhang     for (j=jmin; j<n; j++) {
43049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
43149b5e25fSSatish Balay       cval       = ib[j]*4;
43249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
43349b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
43449b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
43549b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
43649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43749b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43849b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
43949b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
44049b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
44149b5e25fSSatish Balay       v        += 16;
44249b5e25fSSatish Balay     }
44349b5e25fSSatish Balay     xb +=4; ai++;
44449b5e25fSSatish Balay   }
44549b5e25fSSatish Balay 
446d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4471ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
448dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
44949b5e25fSSatish Balay   PetscFunctionReturn(0);
45049b5e25fSSatish Balay }
45149b5e25fSSatish Balay 
452dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
45349b5e25fSSatish Balay {
45449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
455d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
456d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
457d9ca1df4SBarry Smith   const MatScalar   *v;
4586849ba73SBarry Smith   PetscErrorCode    ierr;
459d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
460d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
46198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
46249b5e25fSSatish Balay 
46349b5e25fSSatish Balay   PetscFunctionBegin;
4642dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
465*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
466d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4671ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
46849b5e25fSSatish Balay 
46949b5e25fSSatish Balay   v  = a->a;
47049b5e25fSSatish Balay   xb = x;
47149b5e25fSSatish Balay 
47249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
47349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
47449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
47549b5e25fSSatish Balay     ib          = aj + *ai;
476831a3094SHong Zhang     jmin        = 0;
47798c9bda7SSatish Balay     nonzerorow += (n>0);
4787fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
47949b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
48049b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
48149b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
48249b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
48349b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
484831a3094SHong Zhang       v        += 25; jmin++;
4857fbae186SHong Zhang     }
486444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
487444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488831a3094SHong Zhang     for (j=jmin; j<n; j++) {
48949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
49049b5e25fSSatish Balay       cval       = ib[j]*5;
49149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
49249b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
49349b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
49449b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
49549b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
49649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49749b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
49849b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
49949b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
50049b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
50149b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
50249b5e25fSSatish Balay       v        += 25;
50349b5e25fSSatish Balay     }
50449b5e25fSSatish Balay     xb +=5; ai++;
50549b5e25fSSatish Balay   }
50649b5e25fSSatish Balay 
507d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5081ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
51049b5e25fSSatish Balay   PetscFunctionReturn(0);
51149b5e25fSSatish Balay }
51249b5e25fSSatish Balay 
51349b5e25fSSatish Balay 
514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
51549b5e25fSSatish Balay {
51649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
519d9ca1df4SBarry Smith   const MatScalar   *v;
5206849ba73SBarry Smith   PetscErrorCode    ierr;
521d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
52398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
52449b5e25fSSatish Balay 
52549b5e25fSSatish Balay   PetscFunctionBegin;
5262dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
528d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5291ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
53049b5e25fSSatish Balay 
53149b5e25fSSatish Balay   v  = a->a;
53249b5e25fSSatish Balay   xb = x;
53349b5e25fSSatish Balay 
53449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
53549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
53649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
53749b5e25fSSatish Balay     ib          = aj + *ai;
538831a3094SHong Zhang     jmin        = 0;
53998c9bda7SSatish Balay     nonzerorow += (n>0);
5407fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
54149b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
54249b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
54349b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
54449b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
54549b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
54649b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547831a3094SHong Zhang       v        += 36; jmin++;
5487fbae186SHong Zhang     }
549444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551831a3094SHong Zhang     for (j=jmin; j<n; j++) {
55249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55349b5e25fSSatish Balay       cval       = ib[j]*6;
55449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
55549b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
55649b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
55749b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
55849b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
55949b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
56049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
56149b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
56249b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
56349b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
56449b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
56549b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
56649b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
56749b5e25fSSatish Balay       v        += 36;
56849b5e25fSSatish Balay     }
56949b5e25fSSatish Balay     xb +=6; ai++;
57049b5e25fSSatish Balay   }
57149b5e25fSSatish Balay 
572d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5731ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
57549b5e25fSSatish Balay   PetscFunctionReturn(0);
57649b5e25fSSatish Balay }
577dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
57849b5e25fSSatish Balay {
57949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
580d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
581d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
582d9ca1df4SBarry Smith   const MatScalar   *v;
5836849ba73SBarry Smith   PetscErrorCode    ierr;
584d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
585d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
58698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
58749b5e25fSSatish Balay 
58849b5e25fSSatish Balay   PetscFunctionBegin;
5892dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
590*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
591d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5921ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
59349b5e25fSSatish Balay 
59449b5e25fSSatish Balay   v  = a->a;
59549b5e25fSSatish Balay   xb = x;
59649b5e25fSSatish Balay 
59749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
59849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
59949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
60049b5e25fSSatish Balay     ib          = aj + *ai;
601831a3094SHong Zhang     jmin        = 0;
60298c9bda7SSatish Balay     nonzerorow += (n>0);
6037fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
60449b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
60549b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
60649b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
60749b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
60849b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
60949b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
61049b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
611831a3094SHong Zhang       v        += 49; jmin++;
6127fbae186SHong Zhang     }
613444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
614444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
615831a3094SHong Zhang     for (j=jmin; j<n; j++) {
61649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
61749b5e25fSSatish Balay       cval       = ib[j]*7;
61849b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
61949b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
62049b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
62149b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
62249b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
62349b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
62449b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
62549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
62649b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
62749b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
62849b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
62949b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
63049b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
63149b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
63249b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
63349b5e25fSSatish Balay       v       += 49;
63449b5e25fSSatish Balay     }
63549b5e25fSSatish Balay     xb +=7; ai++;
63649b5e25fSSatish Balay   }
637d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6381ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
639dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
64049b5e25fSSatish Balay   PetscFunctionReturn(0);
64149b5e25fSSatish Balay }
64249b5e25fSSatish Balay 
64349b5e25fSSatish Balay /*
64449b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
64549b5e25fSSatish Balay */
646dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
64749b5e25fSSatish Balay {
64849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
649d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
650d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
651d9ca1df4SBarry Smith   const MatScalar   *v;
652dfbe8321SBarry Smith   PetscErrorCode    ierr;
653d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
654d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
65598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
65649b5e25fSSatish Balay 
65749b5e25fSSatish Balay   PetscFunctionBegin;
6582dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
659*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
660d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6611ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
66249b5e25fSSatish Balay 
66349b5e25fSSatish Balay   aj = a->j;
66449b5e25fSSatish Balay   v  = a->a;
66549b5e25fSSatish Balay   ii = a->i;
66649b5e25fSSatish Balay 
66749b5e25fSSatish Balay   if (!a->mult_work) {
668854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
66949b5e25fSSatish Balay   }
67049b5e25fSSatish Balay   work = a->mult_work;
67149b5e25fSSatish Balay 
67249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
67349b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
67449b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
67598c9bda7SSatish Balay     nonzerorow += (n>0);
67649b5e25fSSatish Balay 
67749b5e25fSSatish Balay     /* upper triangular part */
67849b5e25fSSatish Balay     for (j=0; j<n; j++) {
67949b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
68049b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
68149b5e25fSSatish Balay       workt += bs;
68249b5e25fSSatish Balay     }
68349b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
68496b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
68549b5e25fSSatish Balay 
68649b5e25fSSatish Balay     /* strict lower triangular part */
687831a3094SHong Zhang     idx = aj+ii[0];
688831a3094SHong Zhang     if (*idx == i) {
68996b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
690831a3094SHong Zhang     }
69196b9376eSHong Zhang 
69249b5e25fSSatish Balay     if (ncols > 0) {
69349b5e25fSSatish Balay       workt = work;
694580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
69596b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
696831a3094SHong Zhang       for (j=0; j<n; j++) {
697831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
69849b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
69949b5e25fSSatish Balay         workt += bs;
70049b5e25fSSatish Balay       }
70149b5e25fSSatish Balay     }
70249b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
70349b5e25fSSatish Balay   }
70449b5e25fSSatish Balay 
705d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
7061ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
707dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
70849b5e25fSSatish Balay   PetscFunctionReturn(0);
70949b5e25fSSatish Balay }
71049b5e25fSSatish Balay 
711dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
71249b5e25fSSatish Balay {
71349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
714d9ca1df4SBarry Smith   PetscScalar       *z,x1;
715d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
716d9ca1df4SBarry Smith   const MatScalar   *v;
7176849ba73SBarry Smith   PetscErrorCode    ierr;
718d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
719d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72149b5e25fSSatish Balay 
72249b5e25fSSatish Balay   PetscFunctionBegin;
723343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
724*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
725d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7261ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
72749b5e25fSSatish Balay   v    = a->a;
72849b5e25fSSatish Balay   xb   = x;
72949b5e25fSSatish Balay 
73049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
73249b5e25fSSatish Balay     x1          = xb[0];
73349b5e25fSSatish Balay     ib          = aj + *ai;
734831a3094SHong Zhang     jmin        = 0;
73598c9bda7SSatish Balay     nonzerorow += (n>0);
736831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
737831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
738831a3094SHong Zhang     }
739831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74049b5e25fSSatish Balay       cval    = *ib;
74149b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
74249b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
74349b5e25fSSatish Balay     }
74449b5e25fSSatish Balay     xb++; ai++;
74549b5e25fSSatish Balay   }
74649b5e25fSSatish Balay 
747d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
748bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
74949b5e25fSSatish Balay 
750dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
75149b5e25fSSatish Balay   PetscFunctionReturn(0);
75249b5e25fSSatish Balay }
75349b5e25fSSatish Balay 
754dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
75549b5e25fSSatish Balay {
75649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
757d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
758d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
759d9ca1df4SBarry Smith   const MatScalar   *v;
7606849ba73SBarry Smith   PetscErrorCode    ierr;
761d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
762d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
76398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
76449b5e25fSSatish Balay 
76549b5e25fSSatish Balay   PetscFunctionBegin;
766343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
767*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
768d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7691ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
77049b5e25fSSatish Balay 
77149b5e25fSSatish Balay   v  = a->a;
77249b5e25fSSatish Balay   xb = x;
77349b5e25fSSatish Balay 
77449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
77549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
77649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
77749b5e25fSSatish Balay     ib          = aj + *ai;
778831a3094SHong Zhang     jmin        = 0;
77998c9bda7SSatish Balay     nonzerorow += (n>0);
7807fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
78149b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
78249b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
783831a3094SHong Zhang       v        += 4; jmin++;
7847fbae186SHong Zhang     }
785444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
786444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
787831a3094SHong Zhang     for (j=jmin; j<n; j++) {
78849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
78949b5e25fSSatish Balay       cval       = ib[j]*2;
79049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
79149b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
79249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
79349b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
79449b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
79549b5e25fSSatish Balay       v        += 4;
79649b5e25fSSatish Balay     }
79749b5e25fSSatish Balay     xb +=2; ai++;
79849b5e25fSSatish Balay   }
799d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
800bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
80149b5e25fSSatish Balay 
802dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
80349b5e25fSSatish Balay   PetscFunctionReturn(0);
80449b5e25fSSatish Balay }
80549b5e25fSSatish Balay 
806dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
80749b5e25fSSatish Balay {
80849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
809d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
810d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
811d9ca1df4SBarry Smith   const MatScalar   *v;
8126849ba73SBarry Smith   PetscErrorCode    ierr;
813d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
814d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
81598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
81649b5e25fSSatish Balay 
81749b5e25fSSatish Balay   PetscFunctionBegin;
818343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
819*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
820d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8211ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
82249b5e25fSSatish Balay 
82349b5e25fSSatish Balay   v  = a->a;
82449b5e25fSSatish Balay   xb = x;
82549b5e25fSSatish Balay 
82649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
82749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
82849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
82949b5e25fSSatish Balay     ib          = aj + *ai;
830831a3094SHong Zhang     jmin        = 0;
83198c9bda7SSatish Balay     nonzerorow += (n>0);
8327fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
83349b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
83449b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
83549b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836831a3094SHong Zhang       v        += 9; jmin++;
8377fbae186SHong Zhang     }
838444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
839444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
840831a3094SHong Zhang     for (j=jmin; j<n; j++) {
84149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
84249b5e25fSSatish Balay       cval       = ib[j]*3;
84349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
84449b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
84549b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
84649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
84749b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
84849b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
84949b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
85049b5e25fSSatish Balay       v        += 9;
85149b5e25fSSatish Balay     }
85249b5e25fSSatish Balay     xb +=3; ai++;
85349b5e25fSSatish Balay   }
85449b5e25fSSatish Balay 
855d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
856bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
85749b5e25fSSatish Balay 
858dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
85949b5e25fSSatish Balay   PetscFunctionReturn(0);
86049b5e25fSSatish Balay }
86149b5e25fSSatish Balay 
862dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
86349b5e25fSSatish Balay {
86449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
865d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
866d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
867d9ca1df4SBarry Smith   const MatScalar   *v;
8686849ba73SBarry Smith   PetscErrorCode    ierr;
869d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
870d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
87198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
87249b5e25fSSatish Balay 
87349b5e25fSSatish Balay   PetscFunctionBegin;
874343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
875*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
876d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8771ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
87849b5e25fSSatish Balay 
87949b5e25fSSatish Balay   v  = a->a;
88049b5e25fSSatish Balay   xb = x;
88149b5e25fSSatish Balay 
88249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
88349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
88449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
88549b5e25fSSatish Balay     ib          = aj + *ai;
886831a3094SHong Zhang     jmin        = 0;
88798c9bda7SSatish Balay     nonzerorow += (n>0);
8887fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
88949b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
89049b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
89149b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
89249b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893831a3094SHong Zhang       v        += 16; jmin++;
8947fbae186SHong Zhang     }
895444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897831a3094SHong Zhang     for (j=jmin; j<n; j++) {
89849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
89949b5e25fSSatish Balay       cval       = ib[j]*4;
90049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
90149b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
90249b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
90349b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
90449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
90549b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
90649b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
90749b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
90849b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
90949b5e25fSSatish Balay       v        += 16;
91049b5e25fSSatish Balay     }
91149b5e25fSSatish Balay     xb +=4; ai++;
91249b5e25fSSatish Balay   }
91349b5e25fSSatish Balay 
914d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
915bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
91649b5e25fSSatish Balay 
917dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
91849b5e25fSSatish Balay   PetscFunctionReturn(0);
91949b5e25fSSatish Balay }
92049b5e25fSSatish Balay 
921dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
92249b5e25fSSatish Balay {
92349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
924d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
925d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
926d9ca1df4SBarry Smith   const MatScalar   *v;
9276849ba73SBarry Smith   PetscErrorCode    ierr;
928d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
929d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
93098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
93149b5e25fSSatish Balay 
93249b5e25fSSatish Balay   PetscFunctionBegin;
933343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
934*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
935d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9361ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
93749b5e25fSSatish Balay 
93849b5e25fSSatish Balay   v  = a->a;
93949b5e25fSSatish Balay   xb = x;
94049b5e25fSSatish Balay 
94149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
94249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
94349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
94449b5e25fSSatish Balay     ib          = aj + *ai;
945831a3094SHong Zhang     jmin        = 0;
94698c9bda7SSatish Balay     nonzerorow += (n>0);
9477fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
94849b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
94949b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
95049b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
95149b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
95249b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953831a3094SHong Zhang       v        += 25; jmin++;
9547fbae186SHong Zhang     }
955444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
956444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
957831a3094SHong Zhang     for (j=jmin; j<n; j++) {
95849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
95949b5e25fSSatish Balay       cval       = ib[j]*5;
96049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
96149b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
96249b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
96349b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
96449b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
96549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
96649b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
96749b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
96849b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
96949b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
97049b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
97149b5e25fSSatish Balay       v        += 25;
97249b5e25fSSatish Balay     }
97349b5e25fSSatish Balay     xb +=5; ai++;
97449b5e25fSSatish Balay   }
97549b5e25fSSatish Balay 
976d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
977bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
97849b5e25fSSatish Balay 
979dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
98049b5e25fSSatish Balay   PetscFunctionReturn(0);
98149b5e25fSSatish Balay }
982dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
98349b5e25fSSatish Balay {
98449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
985d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
986d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
987d9ca1df4SBarry Smith   const MatScalar   *v;
9886849ba73SBarry Smith   PetscErrorCode    ierr;
989d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
990d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
99198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
99249b5e25fSSatish Balay 
99349b5e25fSSatish Balay   PetscFunctionBegin;
994343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
995*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
996d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9971ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
99849b5e25fSSatish Balay 
99949b5e25fSSatish Balay   v  = a->a;
100049b5e25fSSatish Balay   xb = x;
100149b5e25fSSatish Balay 
100249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
100349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
100449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
100549b5e25fSSatish Balay     ib          = aj + *ai;
1006831a3094SHong Zhang     jmin        = 0;
100798c9bda7SSatish Balay     nonzerorow += (n>0);
10087fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
100949b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
101049b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
101149b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
101249b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
101349b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
101449b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015831a3094SHong Zhang       v        += 36; jmin++;
10167fbae186SHong Zhang     }
1017444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019831a3094SHong Zhang     for (j=jmin; j<n; j++) {
102049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
102149b5e25fSSatish Balay       cval       = ib[j]*6;
102249b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
102349b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
102449b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
102549b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
102649b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
102749b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
102849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
102949b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
103049b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
103149b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
103249b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
103349b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
103449b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
103549b5e25fSSatish Balay       v        += 36;
103649b5e25fSSatish Balay     }
103749b5e25fSSatish Balay     xb +=6; ai++;
103849b5e25fSSatish Balay   }
103949b5e25fSSatish Balay 
1040d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1041bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
104249b5e25fSSatish Balay 
1043dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
104449b5e25fSSatish Balay   PetscFunctionReturn(0);
104549b5e25fSSatish Balay }
104649b5e25fSSatish Balay 
1047dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
104849b5e25fSSatish Balay {
104949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1050d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1051d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1052d9ca1df4SBarry Smith   const MatScalar   *v;
10536849ba73SBarry Smith   PetscErrorCode    ierr;
1054d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1055d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
105698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
105749b5e25fSSatish Balay 
105849b5e25fSSatish Balay   PetscFunctionBegin;
1059343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1060*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1061d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10621ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
106349b5e25fSSatish Balay 
106449b5e25fSSatish Balay   v  = a->a;
106549b5e25fSSatish Balay   xb = x;
106649b5e25fSSatish Balay 
106749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
106849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
106949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
107049b5e25fSSatish Balay     ib          = aj + *ai;
1071831a3094SHong Zhang     jmin        = 0;
107298c9bda7SSatish Balay     nonzerorow += (n>0);
10737fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
107449b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
107549b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
107649b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
107749b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
107849b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
107949b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
108049b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081831a3094SHong Zhang       v        += 49; jmin++;
10827fbae186SHong Zhang     }
1083444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1084444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085831a3094SHong Zhang     for (j=jmin; j<n; j++) {
108649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
108749b5e25fSSatish Balay       cval       = ib[j]*7;
108849b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
108949b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
109049b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
109149b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
109249b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
109349b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
109449b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
109549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
109649b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
109749b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
109849b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
109949b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
110049b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
110149b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
110249b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
110349b5e25fSSatish Balay       v       += 49;
110449b5e25fSSatish Balay     }
110549b5e25fSSatish Balay     xb +=7; ai++;
110649b5e25fSSatish Balay   }
110749b5e25fSSatish Balay 
1108d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1109bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
111049b5e25fSSatish Balay 
1111dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
111249b5e25fSSatish Balay   PetscFunctionReturn(0);
111349b5e25fSSatish Balay }
111449b5e25fSSatish Balay 
1115dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
111649b5e25fSSatish Balay {
111749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1118d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1119d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1120d9ca1df4SBarry Smith   const MatScalar   *v;
1121dfbe8321SBarry Smith   PetscErrorCode    ierr;
1122d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1123d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
112498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
112549b5e25fSSatish Balay 
112649b5e25fSSatish Balay   PetscFunctionBegin;
1127343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1128*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1129d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11301ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
113149b5e25fSSatish Balay 
113249b5e25fSSatish Balay   aj = a->j;
113349b5e25fSSatish Balay   v  = a->a;
113449b5e25fSSatish Balay   ii = a->i;
113549b5e25fSSatish Balay 
113649b5e25fSSatish Balay   if (!a->mult_work) {
1137854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
113849b5e25fSSatish Balay   }
113949b5e25fSSatish Balay   work = a->mult_work;
114049b5e25fSSatish Balay 
114149b5e25fSSatish Balay 
114249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
114349b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
114449b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
114598c9bda7SSatish Balay     nonzerorow += (n>0);
114649b5e25fSSatish Balay 
114749b5e25fSSatish Balay     /* upper triangular part */
114849b5e25fSSatish Balay     for (j=0; j<n; j++) {
114949b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
115049b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
115149b5e25fSSatish Balay       workt += bs;
115249b5e25fSSatish Balay     }
115349b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
115496b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
115549b5e25fSSatish Balay 
115649b5e25fSSatish Balay     /* strict lower triangular part */
1157831a3094SHong Zhang     idx = aj+ii[0];
1158831a3094SHong Zhang     if (*idx == i) {
115996b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1160831a3094SHong Zhang     }
116149b5e25fSSatish Balay     if (ncols > 0) {
116249b5e25fSSatish Balay       workt = work;
1163580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
116496b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1165831a3094SHong Zhang       for (j=0; j<n; j++) {
1166831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
116749b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
116849b5e25fSSatish Balay         workt += bs;
116949b5e25fSSatish Balay       }
117049b5e25fSSatish Balay     }
117149b5e25fSSatish Balay 
117249b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
117349b5e25fSSatish Balay   }
117449b5e25fSSatish Balay 
1175d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1176bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
117749b5e25fSSatish Balay 
1178dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
117949b5e25fSSatish Balay   PetscFunctionReturn(0);
118049b5e25fSSatish Balay }
118149b5e25fSSatish Balay 
1182f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
118349b5e25fSSatish Balay {
118449b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1185f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1186efee365bSSatish Balay   PetscErrorCode ierr;
1187c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
118849b5e25fSSatish Balay 
118949b5e25fSSatish Balay   PetscFunctionBegin;
1190c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11918b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1192efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
119349b5e25fSSatish Balay   PetscFunctionReturn(0);
119449b5e25fSSatish Balay }
119549b5e25fSSatish Balay 
1196dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
119749b5e25fSSatish Balay {
119849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1199d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
120049b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1201d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
12026849ba73SBarry Smith   PetscErrorCode  ierr;
1203d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
120449b5e25fSSatish Balay 
120549b5e25fSSatish Balay   PetscFunctionBegin;
1206*c40ae873SPierre Jolivet   if (!a->nz) {
1207*c40ae873SPierre Jolivet     *norm = 0.0;
1208*c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1209*c40ae873SPierre Jolivet   }
121049b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
121149b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
121249b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1213831a3094SHong Zhang       col  = aj + jmin;
1214831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
121549b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
121649b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
121749b5e25fSSatish Balay         }
1218831a3094SHong Zhang         jmin++;
1219831a3094SHong Zhang       }
1220831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
122149b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
122249b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
122349b5e25fSSatish Balay         }
122449b5e25fSSatish Balay       }
122549b5e25fSSatish Balay     }
12268f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
122751f70360SJed Brown     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
12280b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1229dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12300b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12310b8dc8d2SHong Zhang     il[0] = 0;
123249b5e25fSSatish Balay 
123349b5e25fSSatish Balay     *norm = 0.0;
123449b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
123549b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
123649b5e25fSSatish Balay       /*-- col sum --*/
123749b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1238831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1239831a3094SHong Zhang                   at step k */
124049b5e25fSSatish Balay       while (i<mbs) {
124149b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
124249b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
124349b5e25fSSatish Balay         for (j=0; j<bs; j++) {
124449b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
124549b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
124649b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
124749b5e25fSSatish Balay           }
124849b5e25fSSatish Balay         }
124949b5e25fSSatish Balay         /* update il, jl */
1250831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1251831a3094SHong Zhang         jmax = a->i[i+1];
125249b5e25fSSatish Balay         if (jmin < jmax) {
125349b5e25fSSatish Balay           il[i] = jmin;
125449b5e25fSSatish Balay           j     = a->j[jmin];
125549b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
125649b5e25fSSatish Balay         }
125749b5e25fSSatish Balay         i = nexti;
125849b5e25fSSatish Balay       }
125949b5e25fSSatish Balay       /*-- row sum --*/
126049b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
126149b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
126249b5e25fSSatish Balay         for (j=0; j<bs; j++) {
126349b5e25fSSatish Balay           v = a->a + i*bs2 + j;
126449b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12650b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
126649b5e25fSSatish Balay           }
126749b5e25fSSatish Balay         }
126849b5e25fSSatish Balay       }
126949b5e25fSSatish Balay       /* add k_th block row to il, jl */
1270831a3094SHong Zhang       col = aj+jmin;
1271831a3094SHong Zhang       if (*col == k) jmin++;
127249b5e25fSSatish Balay       if (jmin < jmax) {
127349b5e25fSSatish Balay         il[k] = jmin;
12740b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
127549b5e25fSSatish Balay       }
127649b5e25fSSatish Balay       for (j=0; j<bs; j++) {
127749b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
127849b5e25fSSatish Balay       }
127949b5e25fSSatish Balay     }
128074ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
128151f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1282f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
128349b5e25fSSatish Balay   PetscFunctionReturn(0);
128449b5e25fSSatish Balay }
128549b5e25fSSatish Balay 
1286ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
128749b5e25fSSatish Balay {
128849b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1289dfbe8321SBarry Smith   PetscErrorCode ierr;
129049b5e25fSSatish Balay 
129149b5e25fSSatish Balay   PetscFunctionBegin;
129249b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1293d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1294ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1295ef511fbeSHong Zhang     PetscFunctionReturn(0);
129649b5e25fSSatish Balay   }
129749b5e25fSSatish Balay 
129849b5e25fSSatish Balay   /* if the a->i are the same */
1299580bdb30SBarry Smith   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
130026fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
130149b5e25fSSatish Balay 
130249b5e25fSSatish Balay   /* if a->j are the same */
1303580bdb30SBarry Smith   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
130426fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
130526fbe8dcSKarl Rupp 
130649b5e25fSSatish Balay   /* if a->a are the same */
1307580bdb30SBarry Smith   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1308935af2e7SHong Zhang   PetscFunctionReturn(0);
130949b5e25fSSatish Balay }
131049b5e25fSSatish Balay 
1311dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
131249b5e25fSSatish Balay {
131349b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1314dfbe8321SBarry Smith   PetscErrorCode  ierr;
1315d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1316d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
131787828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1318d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
131949b5e25fSSatish Balay 
132049b5e25fSSatish Balay   PetscFunctionBegin;
1321d0f46423SBarry Smith   bs = A->rmap->bs;
1322e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
132382799104SHong Zhang 
132449b5e25fSSatish Balay   aa   = a->a;
13258a0c6efdSHong Zhang   ambs = a->mbs;
13268a0c6efdSHong Zhang 
13278a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13288a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13298a0c6efdSHong Zhang     aa   = a->a;
13308a0c6efdSHong Zhang     ambs = a->mbs;
13318a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13328a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13338a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13348a0c6efdSHong Zhang     PetscFunctionReturn(0);
13358a0c6efdSHong Zhang   }
13368a0c6efdSHong Zhang 
133749b5e25fSSatish Balay   ai   = a->i;
133849b5e25fSSatish Balay   aj   = a->j;
133949b5e25fSSatish Balay   bs2  = a->bs2;
13402dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
1341*c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
13421ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
134349b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
134449b5e25fSSatish Balay     j=ai[i];
134549b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
134649b5e25fSSatish Balay       row  = i*bs;
134749b5e25fSSatish Balay       aa_j = aa + j*bs2;
134849b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
134949b5e25fSSatish Balay     }
135049b5e25fSSatish Balay   }
13511ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
135249b5e25fSSatish Balay   PetscFunctionReturn(0);
135349b5e25fSSatish Balay }
135449b5e25fSSatish Balay 
1355dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
135649b5e25fSSatish Balay {
135749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1358d9ca1df4SBarry Smith   PetscScalar       x;
1359d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
136049b5e25fSSatish Balay   MatScalar         *aa,*v;
1361dfbe8321SBarry Smith   PetscErrorCode    ierr;
1362fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1363fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1364ace3abfcSBarry Smith   PetscBool         flg;
136549b5e25fSSatish Balay 
136649b5e25fSSatish Balay   PetscFunctionBegin;
1367b3bf805bSHong Zhang   if (ll != rr) {
1368b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1369e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1370b3bf805bSHong Zhang   }
1371b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
137249b5e25fSSatish Balay   ai  = a->i;
137349b5e25fSSatish Balay   aj  = a->j;
137449b5e25fSSatish Balay   aa  = a->a;
1375d0f46423SBarry Smith   m   = A->rmap->N;
1376d0f46423SBarry Smith   bs  = A->rmap->bs;
137749b5e25fSSatish Balay   mbs = a->mbs;
137849b5e25fSSatish Balay   bs2 = a->bs2;
137949b5e25fSSatish Balay 
1380d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
138149b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1382e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
138349b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
138449b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
138549b5e25fSSatish Balay     li = l + i*bs;
138649b5e25fSSatish Balay     v  = aa + bs2*ai[i];
138749b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
138849b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13895e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
139049b5e25fSSatish Balay         x = ri[k];
139149b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
139249b5e25fSSatish Balay       }
139349b5e25fSSatish Balay     }
139449b5e25fSSatish Balay   }
1395d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1396dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
139749b5e25fSSatish Balay   PetscFunctionReturn(0);
139849b5e25fSSatish Balay }
139949b5e25fSSatish Balay 
1400dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
140149b5e25fSSatish Balay {
140249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
140349b5e25fSSatish Balay 
140449b5e25fSSatish Balay   PetscFunctionBegin;
140549b5e25fSSatish Balay   info->block_size   = a->bs2;
1406ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14076c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
140849b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
140949b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14108e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14117adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1412d5f3da31SBarry Smith   if (A->factortype) {
141349b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
141449b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
141549b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
141649b5e25fSSatish Balay   } else {
141749b5e25fSSatish Balay     info->fill_ratio_given  = 0;
141849b5e25fSSatish Balay     info->fill_ratio_needed = 0;
141949b5e25fSSatish Balay     info->factor_mallocs    = 0;
142049b5e25fSSatish Balay   }
142149b5e25fSSatish Balay   PetscFunctionReturn(0);
142249b5e25fSSatish Balay }
142349b5e25fSSatish Balay 
142449b5e25fSSatish Balay 
1425dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
142649b5e25fSSatish Balay {
142749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1428dfbe8321SBarry Smith   PetscErrorCode ierr;
142949b5e25fSSatish Balay 
143049b5e25fSSatish Balay   PetscFunctionBegin;
1431580bdb30SBarry Smith   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
143249b5e25fSSatish Balay   PetscFunctionReturn(0);
143349b5e25fSSatish Balay }
1434dc354874SHong Zhang 
1435985db425SBarry Smith /*
1436985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1437985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1438985db425SBarry Smith */
1439985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1440dc354874SHong Zhang {
1441dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1442dfbe8321SBarry Smith   PetscErrorCode  ierr;
1443d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1444d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1445c3fca9a7SHong Zhang   PetscReal       atmp;
1446d9ca1df4SBarry Smith   const MatScalar *aa;
1447985db425SBarry Smith   PetscScalar     *x;
144813f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1449dc354874SHong Zhang 
1450dc354874SHong Zhang   PetscFunctionBegin;
1451e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1452e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1453d0f46423SBarry Smith   bs  = A->rmap->bs;
1454dc354874SHong Zhang   aa  = a->a;
1455dc354874SHong Zhang   ai  = a->i;
1456dc354874SHong Zhang   aj  = a->j;
145744117c81SHong Zhang   mbs = a->mbs;
1458dc354874SHong Zhang 
1459985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14601ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1461dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1462e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
146344117c81SHong Zhang   for (i=0; i<mbs; i++) {
1464d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1465d0f6400bSHong Zhang     brow  = bs*i;
146644117c81SHong Zhang     for (j=0; j<ncols; j++) {
1467d0f6400bSHong Zhang       bcol = bs*(*aj);
146844117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1469d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
147044117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1471d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1472d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1473c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1474c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
147544117c81SHong Zhang         }
147644117c81SHong Zhang       }
1477d0f6400bSHong Zhang       aj++;
1478dc354874SHong Zhang     }
1479dc354874SHong Zhang   }
14801ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1481dc354874SHong Zhang   PetscFunctionReturn(0);
1482dc354874SHong Zhang }
1483