xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 1e1ea65d8de51fde77ce8a787efbef25e407badc)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1049b5e25fSSatish Balay {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
126849ba73SBarry Smith   PetscErrorCode ierr;
135d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
145d0c19d7SBarry Smith   const PetscInt *idx;
15db41cccfSHong Zhang   PetscBT        table_out,table_in;
16d94109b8SHong Zhang 
17d94109b8SHong Zhang   PetscFunctionBegin;
18e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19d94109b8SHong Zhang   mbs  = a->mbs;
20d94109b8SHong Zhang   ai   = a->i;
21d94109b8SHong Zhang   aj   = a->j;
22d0f46423SBarry Smith   bs   = A->rmap->bs;
2353b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2653b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27d94109b8SHong Zhang 
28d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
29d94109b8SHong Zhang     isz  = 0;
30db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31d94109b8SHong Zhang 
32d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
33d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35d94109b8SHong Zhang 
36db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37dbe03f88SHong Zhang     bcol_max = 0;
38d94109b8SHong Zhang     for (j=0; j<n; ++j) {
39d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
40e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
42dbe03f88SHong Zhang         nidx[isz++] = brow;
43dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
44dbe03f88SHong Zhang       }
45d94109b8SHong Zhang     }
46d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
476bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48d94109b8SHong Zhang 
49d94109b8SHong Zhang     k = 0;
50d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
51db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54d94109b8SHong Zhang 
55d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
56d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
57d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
58db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59d94109b8SHong Zhang           for (l = start; l<end; l++) {
60d94109b8SHong Zhang             bcol = aj[l];
61d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
62d7b97159SHong Zhang               nidx[isz++] = bcol;
63d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
64d7b97159SHong Zhang             }
65d94109b8SHong Zhang           }
66d94109b8SHong Zhang           k++;
67d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69d94109b8SHong Zhang           for (l = start; l<end; l++) {
70d94109b8SHong Zhang             bcol = aj[l];
71dbe03f88SHong Zhang             if (bcol > bcol_max) break;
72db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7326fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
75d94109b8SHong Zhang             }
76d94109b8SHong Zhang           }
77d94109b8SHong Zhang         }
78d94109b8SHong Zhang       }
79d94109b8SHong Zhang     } /* for each overlap */
80d94109b8SHong Zhang 
81d94109b8SHong Zhang     /* expand the Index Set */
82d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8326fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84d94109b8SHong Zhang     }
8570b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86d94109b8SHong Zhang   }
8794bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9094bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
915eee224dSHong Zhang   PetscFunctionReturn(0);
9249b5e25fSSatish Balay }
9349b5e25fSSatish Balay 
94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
96847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9749b5e25fSSatish Balay {
986849ba73SBarry Smith   PetscErrorCode ierr;
9949b5e25fSSatish Balay 
10049b5e25fSSatish Balay   PetscFunctionBegin;
101847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
103f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
104f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
105f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
106f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
107f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
108f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
109f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
110f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
11149b5e25fSSatish Balay   PetscFunctionReturn(0);
11249b5e25fSSatish Balay }
11349b5e25fSSatish Balay 
1147dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1157dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116e50a8114SHong Zhang {
117e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118e50a8114SHong Zhang   PetscErrorCode ierr;
119e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121e50a8114SHong Zhang   const PetscInt *irow,*icol;
122e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
124e50a8114SHong Zhang   MatScalar      *mat_a;
125e50a8114SHong Zhang   Mat            C;
126e50a8114SHong Zhang   PetscBool      flag;
127e50a8114SHong Zhang 
128e50a8114SHong Zhang   PetscFunctionBegin;
129e50a8114SHong Zhang 
130e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134e50a8114SHong Zhang 
135e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136e50a8114SHong Zhang   ssmap = smap;
137e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139e50a8114SHong Zhang   /* determine lens of each row */
140e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
141e50a8114SHong Zhang     kstart  = ai[irow[i]];
142e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
143e50a8114SHong Zhang     lens[i] = 0;
144e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
145e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
146e50a8114SHong Zhang     }
147e50a8114SHong Zhang   }
148e50a8114SHong Zhang   /* Create and fill new matrix */
149e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
150e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
151e50a8114SHong Zhang 
152e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153580bdb30SBarry Smith     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155580bdb30SBarry Smith     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156e50a8114SHong Zhang     C    = *B;
157e50a8114SHong Zhang   } else {
158e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162e50a8114SHong Zhang   }
163e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
164e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
165e50a8114SHong Zhang     row      = irow[i];
166e50a8114SHong Zhang     kstart   = ai[row];
167e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
168e50a8114SHong Zhang     mat_i    = c->i[i];
169e50a8114SHong Zhang     mat_j    = c->j + mat_i;
170e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
171e50a8114SHong Zhang     mat_ilen = c->ilen + i;
172e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
173e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
174e50a8114SHong Zhang         *mat_j++ = tcol - 1;
175580bdb30SBarry Smith         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176e50a8114SHong Zhang         mat_a   += bs2;
177e50a8114SHong Zhang         (*mat_ilen)++;
178e50a8114SHong Zhang       }
179e50a8114SHong Zhang     }
180e50a8114SHong Zhang   }
181e50a8114SHong Zhang   /* sort */
182e50a8114SHong Zhang   {
183e50a8114SHong Zhang     MatScalar *work;
184e50a8114SHong Zhang 
185e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
187e50a8114SHong Zhang       PetscInt ilen;
188e50a8114SHong Zhang       mat_i = c->i[i];
189e50a8114SHong Zhang       mat_j = c->j + mat_i;
190e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
191e50a8114SHong Zhang       ilen  = c->ilen[i];
192e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193e50a8114SHong Zhang     }
194e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
195e50a8114SHong Zhang   }
196e50a8114SHong Zhang 
197e50a8114SHong Zhang   /* Free work space */
198e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
200e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
201e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203e50a8114SHong Zhang 
204e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205e50a8114SHong Zhang   *B   = C;
206e50a8114SHong Zhang   PetscFunctionReturn(0);
207e50a8114SHong Zhang }
208e50a8114SHong Zhang 
2097dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
21049b5e25fSSatish Balay {
211e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212e50a8114SHong Zhang   IS             is1,is2;
2136849ba73SBarry Smith   PetscErrorCode ierr;
214e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215e50a8114SHong Zhang   const PetscInt *irow,*icol;
21649b5e25fSSatish Balay 
21749b5e25fSSatish Balay   PetscFunctionBegin;
218e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222e50a8114SHong Zhang 
223e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
224e50a8114SHong Zhang    and form the IS with compressed IS */
225e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
226e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
230e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231e50a8114SHong Zhang   }
232e50a8114SHong Zhang   count = 0;
233e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
234e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
235e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
236e50a8114SHong Zhang   }
237e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238e50a8114SHong Zhang 
239580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
242e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243e50a8114SHong Zhang   }
244e50a8114SHong Zhang   count = 0;
245e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
246e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
247e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
248e50a8114SHong Zhang   }
249e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253e50a8114SHong Zhang 
2547dae84e0SHong Zhang   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257847daeccSHong Zhang 
2588f46ffcaSHong Zhang   if (isrow != iscol) {
2598f46ffcaSHong Zhang     PetscBool isequal;
2608f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261847daeccSHong Zhang     if (!isequal) {
262847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2638f46ffcaSHong Zhang     }
26449b5e25fSSatish Balay   }
26549b5e25fSSatish Balay   PetscFunctionReturn(0);
26649b5e25fSSatish Balay }
26749b5e25fSSatish Balay 
2687dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26949b5e25fSSatish Balay {
2706849ba73SBarry Smith   PetscErrorCode ierr;
27113f74950SBarry Smith   PetscInt       i;
27249b5e25fSSatish Balay 
27349b5e25fSSatish Balay   PetscFunctionBegin;
274e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
275df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276afebec48SHong Zhang   }
277e50a8114SHong Zhang 
278e50a8114SHong Zhang   for (i=0; i<n; i++) {
2797dae84e0SHong Zhang     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
28049b5e25fSSatish Balay   }
28149b5e25fSSatish Balay   PetscFunctionReturn(0);
28249b5e25fSSatish Balay }
28349b5e25fSSatish Balay 
28449b5e25fSSatish Balay /* -------------------------------------------------------*/
28549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28649b5e25fSSatish Balay /* -------------------------------------------------------*/
28749b5e25fSSatish Balay 
288dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28949b5e25fSSatish Balay {
29049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
292d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
293d9ca1df4SBarry Smith   const MatScalar   *v;
2946849ba73SBarry Smith   PetscErrorCode    ierr;
295d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29849b5e25fSSatish Balay 
29949b5e25fSSatish Balay   PetscFunctionBegin;
3002dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
302d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3031ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   v  = a->a;
30649b5e25fSSatish Balay   xb = x;
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
31149b5e25fSSatish Balay     ib          = aj + *ai;
312831a3094SHong Zhang     jmin        = 0;
31398c9bda7SSatish Balay     nonzerorow += (n>0);
3147fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31549b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
31649b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
317831a3094SHong Zhang       v        += 4; jmin++;
3187fbae186SHong Zhang     }
319444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32349b5e25fSSatish Balay       cval       = ib[j]*2;
32449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
32549b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
32649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32749b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32849b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32949b5e25fSSatish Balay       v        += 4;
33049b5e25fSSatish Balay     }
33149b5e25fSSatish Balay     xb +=2; ai++;
33249b5e25fSSatish Balay   }
33349b5e25fSSatish Balay 
334d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3351ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
33749b5e25fSSatish Balay   PetscFunctionReturn(0);
33849b5e25fSSatish Balay }
33949b5e25fSSatish Balay 
340dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
34149b5e25fSSatish Balay {
34249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
344d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
345d9ca1df4SBarry Smith   const MatScalar   *v;
3466849ba73SBarry Smith   PetscErrorCode    ierr;
347d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
35049b5e25fSSatish Balay 
35149b5e25fSSatish Balay   PetscFunctionBegin;
3522dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
354d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3551ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35649b5e25fSSatish Balay 
35749b5e25fSSatish Balay   v  = a->a;
35849b5e25fSSatish Balay   xb = x;
35949b5e25fSSatish Balay 
36049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
36349b5e25fSSatish Balay     ib          = aj + *ai;
364831a3094SHong Zhang     jmin        = 0;
36598c9bda7SSatish Balay     nonzerorow += (n>0);
3667fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
36749b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36849b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36949b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370831a3094SHong Zhang       v        += 9; jmin++;
3717fbae186SHong Zhang     }
372444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374831a3094SHong Zhang     for (j=jmin; j<n; j++) {
37549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37649b5e25fSSatish Balay       cval       = ib[j]*3;
37749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37849b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37949b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
38049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38149b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
38249b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
38349b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
38449b5e25fSSatish Balay       v        += 9;
38549b5e25fSSatish Balay     }
38649b5e25fSSatish Balay     xb +=3; ai++;
38749b5e25fSSatish Balay   }
38849b5e25fSSatish Balay 
389d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3901ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
39249b5e25fSSatish Balay   PetscFunctionReturn(0);
39349b5e25fSSatish Balay }
39449b5e25fSSatish Balay 
395dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
39649b5e25fSSatish Balay {
39749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
400d9ca1df4SBarry Smith   const MatScalar   *v;
4016849ba73SBarry Smith   PetscErrorCode    ierr;
402d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
40498c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
40549b5e25fSSatish Balay 
40649b5e25fSSatish Balay   PetscFunctionBegin;
4072dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
409d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4101ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
41149b5e25fSSatish Balay 
41249b5e25fSSatish Balay   v  = a->a;
41349b5e25fSSatish Balay   xb = x;
41449b5e25fSSatish Balay 
41549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
41649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
41749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
41849b5e25fSSatish Balay     ib          = aj + *ai;
419831a3094SHong Zhang     jmin        = 0;
42098c9bda7SSatish Balay     nonzerorow += (n>0);
4217fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
42249b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
42349b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
42449b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
42549b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426831a3094SHong Zhang       v        += 16; jmin++;
4277fbae186SHong Zhang     }
428444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430831a3094SHong Zhang     for (j=jmin; j<n; j++) {
43149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
43249b5e25fSSatish Balay       cval       = ib[j]*4;
43349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
43449b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
43549b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
43649b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
43749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43849b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43949b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
44049b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
44149b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
44249b5e25fSSatish Balay       v        += 16;
44349b5e25fSSatish Balay     }
44449b5e25fSSatish Balay     xb +=4; ai++;
44549b5e25fSSatish Balay   }
44649b5e25fSSatish Balay 
447d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4481ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
45049b5e25fSSatish Balay   PetscFunctionReturn(0);
45149b5e25fSSatish Balay }
45249b5e25fSSatish Balay 
453dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
45449b5e25fSSatish Balay {
45549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
458d9ca1df4SBarry Smith   const MatScalar   *v;
4596849ba73SBarry Smith   PetscErrorCode    ierr;
460d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
46298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
46349b5e25fSSatish Balay 
46449b5e25fSSatish Balay   PetscFunctionBegin;
4652dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
467d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4681ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
46949b5e25fSSatish Balay 
47049b5e25fSSatish Balay   v  = a->a;
47149b5e25fSSatish Balay   xb = x;
47249b5e25fSSatish Balay 
47349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
47449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
47549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
47649b5e25fSSatish Balay     ib          = aj + *ai;
477831a3094SHong Zhang     jmin        = 0;
47898c9bda7SSatish Balay     nonzerorow += (n>0);
4797fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
48049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
48149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
48249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
48349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
48449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485831a3094SHong Zhang       v        += 25; jmin++;
4867fbae186SHong Zhang     }
487444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489831a3094SHong Zhang     for (j=jmin; j<n; j++) {
49049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
49149b5e25fSSatish Balay       cval       = ib[j]*5;
49249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
49349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
49449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
49549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
49649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
49749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
49949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
50049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
50149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
50249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
50349b5e25fSSatish Balay       v        += 25;
50449b5e25fSSatish Balay     }
50549b5e25fSSatish Balay     xb +=5; ai++;
50649b5e25fSSatish Balay   }
50749b5e25fSSatish Balay 
508d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5091ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
51149b5e25fSSatish Balay   PetscFunctionReturn(0);
51249b5e25fSSatish Balay }
51349b5e25fSSatish Balay 
514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
51549b5e25fSSatish Balay {
51649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
519d9ca1df4SBarry Smith   const MatScalar   *v;
5206849ba73SBarry Smith   PetscErrorCode    ierr;
521d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
52398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
52449b5e25fSSatish Balay 
52549b5e25fSSatish Balay   PetscFunctionBegin;
5262dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
528d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5291ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
53049b5e25fSSatish Balay 
53149b5e25fSSatish Balay   v  = a->a;
53249b5e25fSSatish Balay   xb = x;
53349b5e25fSSatish Balay 
53449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
53549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
53649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
53749b5e25fSSatish Balay     ib          = aj + *ai;
538831a3094SHong Zhang     jmin        = 0;
53998c9bda7SSatish Balay     nonzerorow += (n>0);
5407fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
54149b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
54249b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
54349b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
54449b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
54549b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
54649b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547831a3094SHong Zhang       v        += 36; jmin++;
5487fbae186SHong Zhang     }
549444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551831a3094SHong Zhang     for (j=jmin; j<n; j++) {
55249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55349b5e25fSSatish Balay       cval       = ib[j]*6;
55449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
55549b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
55649b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
55749b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
55849b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
55949b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
56049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
56149b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
56249b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
56349b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
56449b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
56549b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
56649b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
56749b5e25fSSatish Balay       v        += 36;
56849b5e25fSSatish Balay     }
56949b5e25fSSatish Balay     xb +=6; ai++;
57049b5e25fSSatish Balay   }
57149b5e25fSSatish Balay 
572d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5731ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
57549b5e25fSSatish Balay   PetscFunctionReturn(0);
57649b5e25fSSatish Balay }
577c2916339SPierre Jolivet 
578dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
57949b5e25fSSatish Balay {
58049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
583d9ca1df4SBarry Smith   const MatScalar   *v;
5846849ba73SBarry Smith   PetscErrorCode    ierr;
585d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
58798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
58849b5e25fSSatish Balay 
58949b5e25fSSatish Balay   PetscFunctionBegin;
5902dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
592d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5931ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
59449b5e25fSSatish Balay 
59549b5e25fSSatish Balay   v  = a->a;
59649b5e25fSSatish Balay   xb = x;
59749b5e25fSSatish Balay 
59849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
59949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
60049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
60149b5e25fSSatish Balay     ib          = aj + *ai;
602831a3094SHong Zhang     jmin        = 0;
60398c9bda7SSatish Balay     nonzerorow += (n>0);
6047fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
60549b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
60649b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
60749b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
60849b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
60949b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
61049b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
61149b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612831a3094SHong Zhang       v        += 49; jmin++;
6137fbae186SHong Zhang     }
614444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616831a3094SHong Zhang     for (j=jmin; j<n; j++) {
61749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
61849b5e25fSSatish Balay       cval       = ib[j]*7;
61949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
62049b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
62149b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
62249b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
62349b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
62449b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
62549b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
62649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
62749b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
62849b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
62949b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
63049b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
63149b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
63249b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
63349b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
63449b5e25fSSatish Balay       v       += 49;
63549b5e25fSSatish Balay     }
63649b5e25fSSatish Balay     xb +=7; ai++;
63749b5e25fSSatish Balay   }
638d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6391ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
64149b5e25fSSatish Balay   PetscFunctionReturn(0);
64249b5e25fSSatish Balay }
64349b5e25fSSatish Balay 
64449b5e25fSSatish Balay /*
64549b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
64649b5e25fSSatish Balay */
647dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
64849b5e25fSSatish Balay {
64949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
652d9ca1df4SBarry Smith   const MatScalar   *v;
653dfbe8321SBarry Smith   PetscErrorCode    ierr;
654d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
65698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
65749b5e25fSSatish Balay 
65849b5e25fSSatish Balay   PetscFunctionBegin;
6592dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
66159689ffbSStefano Zampini   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
66259689ffbSStefano Zampini   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
66359689ffbSStefano Zampini 
66459689ffbSStefano Zampini   x_ptr = x;
66559689ffbSStefano Zampini   z_ptr = z;
66649b5e25fSSatish Balay 
66749b5e25fSSatish Balay   aj = a->j;
66849b5e25fSSatish Balay   v  = a->a;
66949b5e25fSSatish Balay   ii = a->i;
67049b5e25fSSatish Balay 
67149b5e25fSSatish Balay   if (!a->mult_work) {
672854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
67349b5e25fSSatish Balay   }
67449b5e25fSSatish Balay   work = a->mult_work;
67549b5e25fSSatish Balay 
67649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
67749b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
67849b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
67998c9bda7SSatish Balay     nonzerorow += (n>0);
68049b5e25fSSatish Balay 
68149b5e25fSSatish Balay     /* upper triangular part */
68249b5e25fSSatish Balay     for (j=0; j<n; j++) {
68349b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
68449b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
68549b5e25fSSatish Balay       workt += bs;
68649b5e25fSSatish Balay     }
68749b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
68896b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
68949b5e25fSSatish Balay 
69049b5e25fSSatish Balay     /* strict lower triangular part */
691831a3094SHong Zhang     idx = aj+ii[0];
69259689ffbSStefano Zampini     if (n && *idx == i) {
69396b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
694831a3094SHong Zhang     }
69596b9376eSHong Zhang 
69649b5e25fSSatish Balay     if (ncols > 0) {
69749b5e25fSSatish Balay       workt = work;
698580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
69996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
700831a3094SHong Zhang       for (j=0; j<n; j++) {
701831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
70249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
70349b5e25fSSatish Balay         workt += bs;
70449b5e25fSSatish Balay       }
70549b5e25fSSatish Balay     }
70649b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
70749b5e25fSSatish Balay   }
70849b5e25fSSatish Balay 
709d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
7101ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
71249b5e25fSSatish Balay   PetscFunctionReturn(0);
71349b5e25fSSatish Balay }
71449b5e25fSSatish Balay 
715dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
71649b5e25fSSatish Balay {
71749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718d9ca1df4SBarry Smith   PetscScalar       *z,x1;
719d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
720d9ca1df4SBarry Smith   const MatScalar   *v;
7216849ba73SBarry Smith   PetscErrorCode    ierr;
722d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
725eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
726eb1ec7c1SStefano Zampini   const int         aconj = A->hermitian;
727eb1ec7c1SStefano Zampini #else
728eb1ec7c1SStefano Zampini   const int         aconj = 0;
729eb1ec7c1SStefano Zampini #endif
73049b5e25fSSatish Balay 
73149b5e25fSSatish Balay   PetscFunctionBegin;
732343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
733c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
734d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7351ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
73649b5e25fSSatish Balay   v    = a->a;
73749b5e25fSSatish Balay   xb   = x;
73849b5e25fSSatish Balay 
73949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
74049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
74149b5e25fSSatish Balay     x1          = xb[0];
74249b5e25fSSatish Balay     ib          = aj + *ai;
743831a3094SHong Zhang     jmin        = 0;
74498c9bda7SSatish Balay     nonzerorow += (n>0);
7453d9ade75SStefano Zampini     if (n && *ib == i) {            /* (diag of A)*x */
746831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
747831a3094SHong Zhang     }
748eb1ec7c1SStefano Zampini     if (aconj) {
749eb1ec7c1SStefano Zampini       for (j=jmin; j<n; j++) {
750eb1ec7c1SStefano Zampini         cval    = *ib;
751eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
752eb1ec7c1SStefano Zampini         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
753eb1ec7c1SStefano Zampini       }
754eb1ec7c1SStefano Zampini     } else {
755831a3094SHong Zhang       for (j=jmin; j<n; j++) {
75649b5e25fSSatish Balay         cval    = *ib;
75749b5e25fSSatish Balay         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
75849b5e25fSSatish Balay         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
75949b5e25fSSatish Balay       }
760eb1ec7c1SStefano Zampini     }
76149b5e25fSSatish Balay     xb++; ai++;
76249b5e25fSSatish Balay   }
76349b5e25fSSatish Balay 
764d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
765bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76649b5e25fSSatish Balay 
767dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76849b5e25fSSatish Balay   PetscFunctionReturn(0);
76949b5e25fSSatish Balay }
77049b5e25fSSatish Balay 
771dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
77249b5e25fSSatish Balay {
77349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
774d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
775d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
776d9ca1df4SBarry Smith   const MatScalar   *v;
7776849ba73SBarry Smith   PetscErrorCode    ierr;
778d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
779d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
78098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
78149b5e25fSSatish Balay 
78249b5e25fSSatish Balay   PetscFunctionBegin;
783343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
784c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
785d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78749b5e25fSSatish Balay 
78849b5e25fSSatish Balay   v  = a->a;
78949b5e25fSSatish Balay   xb = x;
79049b5e25fSSatish Balay 
79149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
79249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
79349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
79449b5e25fSSatish Balay     ib          = aj + *ai;
795831a3094SHong Zhang     jmin        = 0;
79698c9bda7SSatish Balay     nonzerorow += (n>0);
79759689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
79849b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
79949b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
800831a3094SHong Zhang       v        += 4; jmin++;
8017fbae186SHong Zhang     }
802444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
803444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
804831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80649b5e25fSSatish Balay       cval       = ib[j]*2;
80749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
80849b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
80949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
81049b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
81149b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
81249b5e25fSSatish Balay       v        += 4;
81349b5e25fSSatish Balay     }
81449b5e25fSSatish Balay     xb +=2; ai++;
81549b5e25fSSatish Balay   }
816d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81849b5e25fSSatish Balay 
819dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
82049b5e25fSSatish Balay   PetscFunctionReturn(0);
82149b5e25fSSatish Balay }
82249b5e25fSSatish Balay 
823dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
82449b5e25fSSatish Balay {
82549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
826d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
827d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
828d9ca1df4SBarry Smith   const MatScalar   *v;
8296849ba73SBarry Smith   PetscErrorCode    ierr;
830d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
831d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83349b5e25fSSatish Balay 
83449b5e25fSSatish Balay   PetscFunctionBegin;
835343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
836c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
837d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8381ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
83949b5e25fSSatish Balay 
84049b5e25fSSatish Balay   v  = a->a;
84149b5e25fSSatish Balay   xb = x;
84249b5e25fSSatish Balay 
84349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
84649b5e25fSSatish Balay     ib          = aj + *ai;
847831a3094SHong Zhang     jmin        = 0;
84898c9bda7SSatish Balay     nonzerorow += (n>0);
84959689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
85049b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
85149b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
85249b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
853831a3094SHong Zhang       v        += 9; jmin++;
8547fbae186SHong Zhang     }
855444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
857831a3094SHong Zhang     for (j=jmin; j<n; j++) {
85849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
85949b5e25fSSatish Balay       cval       = ib[j]*3;
86049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
86149b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
86249b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
86349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86449b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
86549b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
86649b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
86749b5e25fSSatish Balay       v        += 9;
86849b5e25fSSatish Balay     }
86949b5e25fSSatish Balay     xb +=3; ai++;
87049b5e25fSSatish Balay   }
87149b5e25fSSatish Balay 
872d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87449b5e25fSSatish Balay 
875dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
87649b5e25fSSatish Balay   PetscFunctionReturn(0);
87749b5e25fSSatish Balay }
87849b5e25fSSatish Balay 
879dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
88049b5e25fSSatish Balay {
88149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
882d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
883d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
884d9ca1df4SBarry Smith   const MatScalar   *v;
8856849ba73SBarry Smith   PetscErrorCode    ierr;
886d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
887d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
88898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
88949b5e25fSSatish Balay 
89049b5e25fSSatish Balay   PetscFunctionBegin;
891343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
892c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
893d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8941ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
89549b5e25fSSatish Balay 
89649b5e25fSSatish Balay   v  = a->a;
89749b5e25fSSatish Balay   xb = x;
89849b5e25fSSatish Balay 
89949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
90249b5e25fSSatish Balay     ib          = aj + *ai;
903831a3094SHong Zhang     jmin        = 0;
90498c9bda7SSatish Balay     nonzerorow += (n>0);
90559689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
90649b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
90749b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
90849b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
90949b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
910831a3094SHong Zhang       v        += 16; jmin++;
9117fbae186SHong Zhang     }
912444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
913444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
914831a3094SHong Zhang     for (j=jmin; j<n; j++) {
91549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
91649b5e25fSSatish Balay       cval       = ib[j]*4;
91749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
91849b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
91949b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
92049b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
92149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92249b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
92349b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
92449b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
92549b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
92649b5e25fSSatish Balay       v        += 16;
92749b5e25fSSatish Balay     }
92849b5e25fSSatish Balay     xb +=4; ai++;
92949b5e25fSSatish Balay   }
93049b5e25fSSatish Balay 
931d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
932bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
93349b5e25fSSatish Balay 
934dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
93549b5e25fSSatish Balay   PetscFunctionReturn(0);
93649b5e25fSSatish Balay }
93749b5e25fSSatish Balay 
938dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
93949b5e25fSSatish Balay {
94049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
941d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
942d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
943d9ca1df4SBarry Smith   const MatScalar   *v;
9446849ba73SBarry Smith   PetscErrorCode    ierr;
945d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
946d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
94798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
94849b5e25fSSatish Balay 
94949b5e25fSSatish Balay   PetscFunctionBegin;
950343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
951c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
952d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9531ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
95449b5e25fSSatish Balay 
95549b5e25fSSatish Balay   v  = a->a;
95649b5e25fSSatish Balay   xb = x;
95749b5e25fSSatish Balay 
95849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
95949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
96149b5e25fSSatish Balay     ib          = aj + *ai;
962831a3094SHong Zhang     jmin        = 0;
96398c9bda7SSatish Balay     nonzerorow += (n>0);
96459689ffbSStefano Zampini     if (n && *ib == i) {      /* (diag of A)*x */
96549b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
96649b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
96749b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
96849b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
96949b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
970831a3094SHong Zhang       v        += 25; jmin++;
9717fbae186SHong Zhang     }
972444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
973444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
974831a3094SHong Zhang     for (j=jmin; j<n; j++) {
97549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
97649b5e25fSSatish Balay       cval       = ib[j]*5;
97749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
97849b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
97949b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
98049b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
98149b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
98249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
98349b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
98449b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
98549b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
98649b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
98749b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
98849b5e25fSSatish Balay       v        += 25;
98949b5e25fSSatish Balay     }
99049b5e25fSSatish Balay     xb +=5; ai++;
99149b5e25fSSatish Balay   }
99249b5e25fSSatish Balay 
993d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
994bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
99549b5e25fSSatish Balay 
996dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
99749b5e25fSSatish Balay   PetscFunctionReturn(0);
99849b5e25fSSatish Balay }
999c2916339SPierre Jolivet 
1000dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
100149b5e25fSSatish Balay {
100249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1003d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1004d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1005d9ca1df4SBarry Smith   const MatScalar   *v;
10066849ba73SBarry Smith   PetscErrorCode    ierr;
1007d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1008d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
100998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
101049b5e25fSSatish Balay 
101149b5e25fSSatish Balay   PetscFunctionBegin;
1012343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1013c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1014d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10151ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
101649b5e25fSSatish Balay 
101749b5e25fSSatish Balay   v  = a->a;
101849b5e25fSSatish Balay   xb = x;
101949b5e25fSSatish Balay 
102049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
102149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
102249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
102349b5e25fSSatish Balay     ib          = aj + *ai;
1024831a3094SHong Zhang     jmin        = 0;
102598c9bda7SSatish Balay     nonzerorow += (n>0);
102659689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
102749b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
102849b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
102949b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
103049b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
103149b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
103249b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1033831a3094SHong Zhang       v        += 36; jmin++;
10347fbae186SHong Zhang     }
1035444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1036444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1037831a3094SHong Zhang     for (j=jmin; j<n; j++) {
103849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
103949b5e25fSSatish Balay       cval       = ib[j]*6;
104049b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
104149b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
104249b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
104349b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
104449b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
104549b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
104649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
104749b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
104849b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
104949b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
105049b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
105149b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
105249b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
105349b5e25fSSatish Balay       v        += 36;
105449b5e25fSSatish Balay     }
105549b5e25fSSatish Balay     xb +=6; ai++;
105649b5e25fSSatish Balay   }
105749b5e25fSSatish Balay 
1058d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1059bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
106049b5e25fSSatish Balay 
1061dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
106249b5e25fSSatish Balay   PetscFunctionReturn(0);
106349b5e25fSSatish Balay }
106449b5e25fSSatish Balay 
1065dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
106649b5e25fSSatish Balay {
106749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1068d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1069d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1070d9ca1df4SBarry Smith   const MatScalar   *v;
10716849ba73SBarry Smith   PetscErrorCode    ierr;
1072d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1073d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
107498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
107549b5e25fSSatish Balay 
107649b5e25fSSatish Balay   PetscFunctionBegin;
1077343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1078c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1079d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10801ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
108149b5e25fSSatish Balay 
108249b5e25fSSatish Balay   v  = a->a;
108349b5e25fSSatish Balay   xb = x;
108449b5e25fSSatish Balay 
108549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
108649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
108749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
108849b5e25fSSatish Balay     ib          = aj + *ai;
1089831a3094SHong Zhang     jmin        = 0;
109098c9bda7SSatish Balay     nonzerorow += (n>0);
109159689ffbSStefano Zampini     if (n && *ib == i) {     /* (diag of A)*x */
109249b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
109349b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
109449b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
109549b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
109649b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
109749b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
109849b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1099831a3094SHong Zhang       v        += 49; jmin++;
11007fbae186SHong Zhang     }
1101444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103831a3094SHong Zhang     for (j=jmin; j<n; j++) {
110449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
110549b5e25fSSatish Balay       cval       = ib[j]*7;
110649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
110749b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
110849b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
110949b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
111049b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
111149b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
111249b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
111349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
111449b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
111549b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
111649b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
111749b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
111849b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
111949b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
112049b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
112149b5e25fSSatish Balay       v       += 49;
112249b5e25fSSatish Balay     }
112349b5e25fSSatish Balay     xb +=7; ai++;
112449b5e25fSSatish Balay   }
112549b5e25fSSatish Balay 
1126d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1127bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
112849b5e25fSSatish Balay 
1129dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
113049b5e25fSSatish Balay   PetscFunctionReturn(0);
113149b5e25fSSatish Balay }
113249b5e25fSSatish Balay 
1133dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
113449b5e25fSSatish Balay {
113549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1136f4259b30SLisandro Dalcin   PetscScalar       *z,*z_ptr=NULL,*zb,*work,*workt;
1137d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1138d9ca1df4SBarry Smith   const MatScalar   *v;
1139dfbe8321SBarry Smith   PetscErrorCode    ierr;
1140d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1141d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
114298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
114349b5e25fSSatish Balay 
114449b5e25fSSatish Balay   PetscFunctionBegin;
1145343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1146c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1147d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11481ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
114949b5e25fSSatish Balay 
115049b5e25fSSatish Balay   aj = a->j;
115149b5e25fSSatish Balay   v  = a->a;
115249b5e25fSSatish Balay   ii = a->i;
115349b5e25fSSatish Balay 
115449b5e25fSSatish Balay   if (!a->mult_work) {
1155854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
115649b5e25fSSatish Balay   }
115749b5e25fSSatish Balay   work = a->mult_work;
115849b5e25fSSatish Balay 
115949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
116049b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
116149b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
116298c9bda7SSatish Balay     nonzerorow += (n>0);
116349b5e25fSSatish Balay 
116449b5e25fSSatish Balay     /* upper triangular part */
116549b5e25fSSatish Balay     for (j=0; j<n; j++) {
116649b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
116749b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
116849b5e25fSSatish Balay       workt += bs;
116949b5e25fSSatish Balay     }
117049b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
117196b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
117249b5e25fSSatish Balay 
117349b5e25fSSatish Balay     /* strict lower triangular part */
1174831a3094SHong Zhang     idx = aj+ii[0];
117559689ffbSStefano Zampini     if (n && *idx == i) {
117696b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1177831a3094SHong Zhang     }
117849b5e25fSSatish Balay     if (ncols > 0) {
117949b5e25fSSatish Balay       workt = work;
1180580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
118196b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1182831a3094SHong Zhang       for (j=0; j<n; j++) {
1183831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
118449b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
118549b5e25fSSatish Balay         workt += bs;
118649b5e25fSSatish Balay       }
118749b5e25fSSatish Balay     }
118849b5e25fSSatish Balay 
118949b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
119049b5e25fSSatish Balay   }
119149b5e25fSSatish Balay 
1192d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1193bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
119449b5e25fSSatish Balay 
1195dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
119649b5e25fSSatish Balay   PetscFunctionReturn(0);
119749b5e25fSSatish Balay }
119849b5e25fSSatish Balay 
1199f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
120049b5e25fSSatish Balay {
120149b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1202f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1203efee365bSSatish Balay   PetscErrorCode ierr;
1204c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
120549b5e25fSSatish Balay 
120649b5e25fSSatish Balay   PetscFunctionBegin;
1207c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
12088b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1209efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
121049b5e25fSSatish Balay   PetscFunctionReturn(0);
121149b5e25fSSatish Balay }
121249b5e25fSSatish Balay 
1213dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
121449b5e25fSSatish Balay {
121549b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1216d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
121749b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1218d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
12196849ba73SBarry Smith   PetscErrorCode  ierr;
1220d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
122149b5e25fSSatish Balay 
122249b5e25fSSatish Balay   PetscFunctionBegin;
1223c40ae873SPierre Jolivet   if (!a->nz) {
1224c40ae873SPierre Jolivet     *norm = 0.0;
1225c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1226c40ae873SPierre Jolivet   }
122749b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
122849b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
122959689ffbSStefano Zampini       jmin = a->i[k];
123059689ffbSStefano Zampini       jmax = a->i[k+1];
1231831a3094SHong Zhang       col  = aj + jmin;
123259689ffbSStefano Zampini       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
123349b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
123449b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
123549b5e25fSSatish Balay         }
1236831a3094SHong Zhang         jmin++;
1237831a3094SHong Zhang       }
1238831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
123949b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
124049b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
124149b5e25fSSatish Balay         }
124249b5e25fSSatish Balay       }
124349b5e25fSSatish Balay     }
12448f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1245ca0c957dSBarry Smith     ierr = PetscLogFlops(2.0*bs2*a->nz);CHKERRQ(ierr);
12460b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1247dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12480b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12490b8dc8d2SHong Zhang     il[0] = 0;
125049b5e25fSSatish Balay 
125149b5e25fSSatish Balay     *norm = 0.0;
125249b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
125349b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
125449b5e25fSSatish Balay       /*-- col sum --*/
125549b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1256831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1257831a3094SHong Zhang                   at step k */
125849b5e25fSSatish Balay       while (i<mbs) {
125949b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
126049b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
126149b5e25fSSatish Balay         for (j=0; j<bs; j++) {
126249b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
126349b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
126449b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
126549b5e25fSSatish Balay           }
126649b5e25fSSatish Balay         }
126749b5e25fSSatish Balay         /* update il, jl */
1268831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1269831a3094SHong Zhang         jmax = a->i[i+1];
127049b5e25fSSatish Balay         if (jmin < jmax) {
127149b5e25fSSatish Balay           il[i] = jmin;
127249b5e25fSSatish Balay           j     = a->j[jmin];
127349b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
127449b5e25fSSatish Balay         }
127549b5e25fSSatish Balay         i = nexti;
127649b5e25fSSatish Balay       }
127749b5e25fSSatish Balay       /*-- row sum --*/
127859689ffbSStefano Zampini       jmin = a->i[k];
127959689ffbSStefano Zampini       jmax = a->i[k+1];
128049b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
128149b5e25fSSatish Balay         for (j=0; j<bs; j++) {
128249b5e25fSSatish Balay           v = a->a + i*bs2 + j;
128349b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12840b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
128549b5e25fSSatish Balay           }
128649b5e25fSSatish Balay         }
128749b5e25fSSatish Balay       }
128849b5e25fSSatish Balay       /* add k_th block row to il, jl */
1289831a3094SHong Zhang       col = aj+jmin;
129059689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
129149b5e25fSSatish Balay       if (jmin < jmax) {
129249b5e25fSSatish Balay         il[k] = jmin;
12930b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
129449b5e25fSSatish Balay       }
129549b5e25fSSatish Balay       for (j=0; j<bs; j++) {
129649b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
129749b5e25fSSatish Balay       }
129849b5e25fSSatish Balay     }
129974ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
130051f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1301f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
130249b5e25fSSatish Balay   PetscFunctionReturn(0);
130349b5e25fSSatish Balay }
130449b5e25fSSatish Balay 
1305ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
130649b5e25fSSatish Balay {
130749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1308dfbe8321SBarry Smith   PetscErrorCode ierr;
130949b5e25fSSatish Balay 
131049b5e25fSSatish Balay   PetscFunctionBegin;
131149b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1312d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1313ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1314ef511fbeSHong Zhang     PetscFunctionReturn(0);
131549b5e25fSSatish Balay   }
131649b5e25fSSatish Balay 
131749b5e25fSSatish Balay   /* if the a->i are the same */
1318580bdb30SBarry Smith   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
131926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
132049b5e25fSSatish Balay 
132149b5e25fSSatish Balay   /* if a->j are the same */
1322580bdb30SBarry Smith   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
132326fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
132426fbe8dcSKarl Rupp 
132549b5e25fSSatish Balay   /* if a->a are the same */
1326580bdb30SBarry Smith   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1327935af2e7SHong Zhang   PetscFunctionReturn(0);
132849b5e25fSSatish Balay }
132949b5e25fSSatish Balay 
1330dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
133149b5e25fSSatish Balay {
133249b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1333dfbe8321SBarry Smith   PetscErrorCode  ierr;
1334d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1335d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
133687828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1337d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
133849b5e25fSSatish Balay 
133949b5e25fSSatish Balay   PetscFunctionBegin;
1340d0f46423SBarry Smith   bs = A->rmap->bs;
1341e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
134282799104SHong Zhang 
134349b5e25fSSatish Balay   aa   = a->a;
13448a0c6efdSHong Zhang   ambs = a->mbs;
13458a0c6efdSHong Zhang 
13468a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13478a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13488a0c6efdSHong Zhang     aa   = a->a;
13498a0c6efdSHong Zhang     ambs = a->mbs;
13508a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13518a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13528a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13538a0c6efdSHong Zhang     PetscFunctionReturn(0);
13548a0c6efdSHong Zhang   }
13558a0c6efdSHong Zhang 
135649b5e25fSSatish Balay   ai   = a->i;
135749b5e25fSSatish Balay   aj   = a->j;
135849b5e25fSSatish Balay   bs2  = a->bs2;
13592dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
1360c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
13611ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
136249b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
136349b5e25fSSatish Balay     j = ai[i];
136449b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
136549b5e25fSSatish Balay       row  = i*bs;
136649b5e25fSSatish Balay       aa_j = aa + j*bs2;
136749b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
136849b5e25fSSatish Balay     }
136949b5e25fSSatish Balay   }
13701ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
137149b5e25fSSatish Balay   PetscFunctionReturn(0);
137249b5e25fSSatish Balay }
137349b5e25fSSatish Balay 
1374dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
137549b5e25fSSatish Balay {
137649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1377d9ca1df4SBarry Smith   PetscScalar       x;
1378d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
137949b5e25fSSatish Balay   MatScalar         *aa,*v;
1380dfbe8321SBarry Smith   PetscErrorCode    ierr;
1381fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1382fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1383ace3abfcSBarry Smith   PetscBool         flg;
138449b5e25fSSatish Balay 
138549b5e25fSSatish Balay   PetscFunctionBegin;
1386b3bf805bSHong Zhang   if (ll != rr) {
1387b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1388e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1389b3bf805bSHong Zhang   }
1390b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
139149b5e25fSSatish Balay   ai  = a->i;
139249b5e25fSSatish Balay   aj  = a->j;
139349b5e25fSSatish Balay   aa  = a->a;
1394d0f46423SBarry Smith   m   = A->rmap->N;
1395d0f46423SBarry Smith   bs  = A->rmap->bs;
139649b5e25fSSatish Balay   mbs = a->mbs;
139749b5e25fSSatish Balay   bs2 = a->bs2;
139849b5e25fSSatish Balay 
1399d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
140049b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1401e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
140249b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
140349b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
140449b5e25fSSatish Balay     li = l + i*bs;
140549b5e25fSSatish Balay     v  = aa + bs2*ai[i];
140649b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
140749b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
14085e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
140949b5e25fSSatish Balay         x = ri[k];
141049b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
141149b5e25fSSatish Balay       }
141249b5e25fSSatish Balay     }
141349b5e25fSSatish Balay   }
1414d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1415dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
141649b5e25fSSatish Balay   PetscFunctionReturn(0);
141749b5e25fSSatish Balay }
141849b5e25fSSatish Balay 
1419dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
142049b5e25fSSatish Balay {
142149b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
142249b5e25fSSatish Balay 
142349b5e25fSSatish Balay   PetscFunctionBegin;
142449b5e25fSSatish Balay   info->block_size   = a->bs2;
1425ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14266c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
14273966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
142849b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14298e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14307adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1431d5f3da31SBarry Smith   if (A->factortype) {
143249b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
143349b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
143449b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
143549b5e25fSSatish Balay   } else {
143649b5e25fSSatish Balay     info->fill_ratio_given  = 0;
143749b5e25fSSatish Balay     info->fill_ratio_needed = 0;
143849b5e25fSSatish Balay     info->factor_mallocs    = 0;
143949b5e25fSSatish Balay   }
144049b5e25fSSatish Balay   PetscFunctionReturn(0);
144149b5e25fSSatish Balay }
144249b5e25fSSatish Balay 
1443dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
144449b5e25fSSatish Balay {
144549b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1446dfbe8321SBarry Smith   PetscErrorCode ierr;
144749b5e25fSSatish Balay 
144849b5e25fSSatish Balay   PetscFunctionBegin;
1449580bdb30SBarry Smith   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
145049b5e25fSSatish Balay   PetscFunctionReturn(0);
145149b5e25fSSatish Balay }
1452dc354874SHong Zhang 
1453985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1454dc354874SHong Zhang {
1455dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1456dfbe8321SBarry Smith   PetscErrorCode  ierr;
1457d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1458d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1459c3fca9a7SHong Zhang   PetscReal       atmp;
1460d9ca1df4SBarry Smith   const MatScalar *aa;
1461985db425SBarry Smith   PetscScalar     *x;
146213f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1463dc354874SHong Zhang 
1464dc354874SHong Zhang   PetscFunctionBegin;
1465e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1466e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1467d0f46423SBarry Smith   bs  = A->rmap->bs;
1468dc354874SHong Zhang   aa  = a->a;
1469dc354874SHong Zhang   ai  = a->i;
1470dc354874SHong Zhang   aj  = a->j;
147144117c81SHong Zhang   mbs = a->mbs;
1472dc354874SHong Zhang 
1473985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14741ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1475dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1476e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
147744117c81SHong Zhang   for (i=0; i<mbs; i++) {
1478d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1479d0f6400bSHong Zhang     brow  = bs*i;
148044117c81SHong Zhang     for (j=0; j<ncols; j++) {
1481d0f6400bSHong Zhang       bcol = bs*(*aj);
148244117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1483d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
148444117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1485d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1486d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1487c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1488c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
148944117c81SHong Zhang         }
149044117c81SHong Zhang       }
1491d0f6400bSHong Zhang       aj++;
1492dc354874SHong Zhang     }
1493dc354874SHong Zhang   }
14941ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1495dc354874SHong Zhang   PetscFunctionReturn(0);
1496dc354874SHong Zhang }
1497c2916339SPierre Jolivet 
14984222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1499c2916339SPierre Jolivet {
1500c2916339SPierre Jolivet   PetscErrorCode ierr;
1501c2916339SPierre Jolivet 
1502c2916339SPierre Jolivet   PetscFunctionBegin;
1503c2916339SPierre Jolivet   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
15044222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1505c2916339SPierre Jolivet   PetscFunctionReturn(0);
1506c2916339SPierre Jolivet }
1507c2916339SPierre Jolivet 
1508c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1509c2916339SPierre Jolivet {
1510c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1511c2916339SPierre Jolivet   PetscScalar       *z = c;
1512c2916339SPierre Jolivet   const PetscScalar *xb;
1513c2916339SPierre Jolivet   PetscScalar       x1;
1514c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1515c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
15163c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
15173c2e41e1SStefano Zampini   const int         aconj = A->hermitian;
15183c2e41e1SStefano Zampini #else
15193c2e41e1SStefano Zampini   const int         aconj = 0;
15203c2e41e1SStefano Zampini #endif
1521c2916339SPierre Jolivet 
1522c2916339SPierre Jolivet   PetscFunctionBegin;
1523c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1524c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1525c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1526c2916339SPierre Jolivet     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1527c2916339SPierre Jolivet     jj = idx;
1528c2916339SPierre Jolivet     vv = v;
1529c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1530c2916339SPierre Jolivet       idx = jj;
1531c2916339SPierre Jolivet       v = vv;
1532c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1533c2916339SPierre Jolivet         xb = b + (*idx); x1 = xb[0+k*bm];
1534c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1;
15353c2e41e1SStefano Zampini         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1536c2916339SPierre Jolivet         v += 1;
1537c2916339SPierre Jolivet         ++idx;
1538c2916339SPierre Jolivet       }
1539c2916339SPierre Jolivet     }
1540c2916339SPierre Jolivet     z += 1;
1541c2916339SPierre Jolivet   }
1542c2916339SPierre Jolivet   PetscFunctionReturn(0);
1543c2916339SPierre Jolivet }
1544c2916339SPierre Jolivet 
1545c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1546c2916339SPierre Jolivet {
1547c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1548c2916339SPierre Jolivet   PetscScalar       *z = c;
1549c2916339SPierre Jolivet   const PetscScalar *xb;
1550c2916339SPierre Jolivet   PetscScalar       x1,x2;
1551c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1552c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1553c2916339SPierre Jolivet 
1554c2916339SPierre Jolivet   PetscFunctionBegin;
1555c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1556c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1557c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1558c2916339SPierre Jolivet     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1559c2916339SPierre Jolivet     jj = idx;
1560c2916339SPierre Jolivet     vv = v;
1561c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1562c2916339SPierre Jolivet       idx = jj;
1563c2916339SPierre Jolivet       v = vv;
1564c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1565c2916339SPierre Jolivet         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1566c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1567c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1568c2916339SPierre Jolivet         if (*idx != i) {
1569c2916339SPierre Jolivet           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1570c2916339SPierre Jolivet           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1571c2916339SPierre Jolivet         }
1572c2916339SPierre Jolivet         v += 4;
1573c2916339SPierre Jolivet         ++idx;
1574c2916339SPierre Jolivet       }
1575c2916339SPierre Jolivet     }
1576c2916339SPierre Jolivet     z += 2;
1577c2916339SPierre Jolivet   }
1578c2916339SPierre Jolivet   PetscFunctionReturn(0);
1579c2916339SPierre Jolivet }
1580c2916339SPierre Jolivet 
1581c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1582c2916339SPierre Jolivet {
1583c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1584c2916339SPierre Jolivet   PetscScalar       *z = c;
1585c2916339SPierre Jolivet   const PetscScalar *xb;
1586c2916339SPierre Jolivet   PetscScalar       x1,x2,x3;
1587c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1588c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1589c2916339SPierre Jolivet 
1590c2916339SPierre Jolivet   PetscFunctionBegin;
1591c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1592c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1593c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1594c2916339SPierre Jolivet     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1595c2916339SPierre Jolivet     jj = idx;
1596c2916339SPierre Jolivet     vv = v;
1597c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1598c2916339SPierre Jolivet       idx = jj;
1599c2916339SPierre Jolivet       v = vv;
1600c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1601c2916339SPierre Jolivet         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1602c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1603c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1604c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1605c2916339SPierre Jolivet         if (*idx != i) {
1606c2916339SPierre Jolivet           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1607c2916339SPierre Jolivet           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1608c2916339SPierre Jolivet           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1609c2916339SPierre Jolivet         }
1610c2916339SPierre Jolivet         v += 9;
1611c2916339SPierre Jolivet         ++idx;
1612c2916339SPierre Jolivet       }
1613c2916339SPierre Jolivet     }
1614c2916339SPierre Jolivet     z += 3;
1615c2916339SPierre Jolivet   }
1616c2916339SPierre Jolivet   PetscFunctionReturn(0);
1617c2916339SPierre Jolivet }
1618c2916339SPierre Jolivet 
1619c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1620c2916339SPierre Jolivet {
1621c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1622c2916339SPierre Jolivet   PetscScalar       *z = c;
1623c2916339SPierre Jolivet   const PetscScalar *xb;
1624c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4;
1625c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1626c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1627c2916339SPierre Jolivet 
1628c2916339SPierre Jolivet   PetscFunctionBegin;
1629c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1630c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1631c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1632c2916339SPierre Jolivet     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1633c2916339SPierre Jolivet     jj = idx;
1634c2916339SPierre Jolivet     vv = v;
1635c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1636c2916339SPierre Jolivet       idx = jj;
1637c2916339SPierre Jolivet       v = vv;
1638c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1639c2916339SPierre Jolivet         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1640c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1641c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1642c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1643c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1644c2916339SPierre Jolivet         if (*idx != i) {
1645c2916339SPierre Jolivet           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1646c2916339SPierre Jolivet           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1647c2916339SPierre Jolivet           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1648c2916339SPierre Jolivet           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1649c2916339SPierre Jolivet         }
1650c2916339SPierre Jolivet         v += 16;
1651c2916339SPierre Jolivet         ++idx;
1652c2916339SPierre Jolivet       }
1653c2916339SPierre Jolivet     }
1654c2916339SPierre Jolivet     z += 4;
1655c2916339SPierre Jolivet   }
1656c2916339SPierre Jolivet   PetscFunctionReturn(0);
1657c2916339SPierre Jolivet }
1658c2916339SPierre Jolivet 
1659c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1660c2916339SPierre Jolivet {
1661c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1662c2916339SPierre Jolivet   PetscScalar       *z = c;
1663c2916339SPierre Jolivet   const PetscScalar *xb;
1664c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4,x5;
1665c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1666c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1667c2916339SPierre Jolivet 
1668c2916339SPierre Jolivet   PetscFunctionBegin;
1669c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1670c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1671c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1672c2916339SPierre Jolivet     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1673c2916339SPierre Jolivet     jj = idx;
1674c2916339SPierre Jolivet     vv = v;
1675c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1676c2916339SPierre Jolivet       idx = jj;
1677c2916339SPierre Jolivet       v = vv;
1678c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1679c2916339SPierre Jolivet         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1680c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1681c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1682c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1683c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1684c2916339SPierre Jolivet         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1685c2916339SPierre Jolivet         if (*idx != i) {
1686c2916339SPierre Jolivet           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1687c2916339SPierre Jolivet           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1688c2916339SPierre Jolivet           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1689c2916339SPierre Jolivet           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1690c2916339SPierre Jolivet           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1691c2916339SPierre Jolivet         }
1692c2916339SPierre Jolivet         v += 25;
1693c2916339SPierre Jolivet         ++idx;
1694c2916339SPierre Jolivet       }
1695c2916339SPierre Jolivet     }
1696c2916339SPierre Jolivet     z += 5;
1697c2916339SPierre Jolivet   }
1698c2916339SPierre Jolivet   PetscFunctionReturn(0);
1699c2916339SPierre Jolivet }
1700c2916339SPierre Jolivet 
1701c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1702c2916339SPierre Jolivet {
1703c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1704c2916339SPierre Jolivet   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1705281439baSStefano Zampini   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1706c2916339SPierre Jolivet   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1707c2916339SPierre Jolivet   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1708c2916339SPierre Jolivet   PetscBLASInt      bbs,bcn,bbm,bcm;
1709f4259b30SLisandro Dalcin   PetscScalar       *z = NULL;
1710c2916339SPierre Jolivet   PetscScalar       *c,*b;
1711c2916339SPierre Jolivet   const MatScalar   *v;
1712c2916339SPierre Jolivet   const PetscInt    *idx,*ii;
1713c2916339SPierre Jolivet   PetscScalar       _DOne=1.0;
1714c2916339SPierre Jolivet   PetscErrorCode    ierr;
1715c2916339SPierre Jolivet 
1716c2916339SPierre Jolivet   PetscFunctionBegin;
1717c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
1718c2916339SPierre Jolivet   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1719c2916339SPierre Jolivet   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1720c2916339SPierre Jolivet   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1721c2916339SPierre Jolivet   b = bd->v;
1722c2916339SPierre Jolivet   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1723c2916339SPierre Jolivet   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1724c2916339SPierre Jolivet   switch (bs) {
1725c2916339SPierre Jolivet   case 1:
1726*1e1ea65dSPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
1727c2916339SPierre Jolivet     break;
1728c2916339SPierre Jolivet   case 2:
1729*1e1ea65dSPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
1730c2916339SPierre Jolivet     break;
1731c2916339SPierre Jolivet   case 3:
1732*1e1ea65dSPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
1733c2916339SPierre Jolivet     break;
1734c2916339SPierre Jolivet   case 4:
1735*1e1ea65dSPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
1736c2916339SPierre Jolivet     break;
1737c2916339SPierre Jolivet   case 5:
1738*1e1ea65dSPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
1739c2916339SPierre Jolivet     break;
1740c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1741c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1742c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1743c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1744c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1745c2916339SPierre Jolivet     idx = a->j;
1746c2916339SPierre Jolivet     v   = a->a;
1747c2916339SPierre Jolivet     mbs = a->mbs;
1748c2916339SPierre Jolivet     ii  = a->i;
1749c2916339SPierre Jolivet     z   = c;
1750c2916339SPierre Jolivet     for (i=0; i<mbs; i++) {
1751c2916339SPierre Jolivet       n = ii[1] - ii[0]; ii++;
1752c2916339SPierre Jolivet       for (j=0; j<n; j++) {
17536718818eSStefano Zampini         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1754c2916339SPierre Jolivet         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1755c2916339SPierre Jolivet         v += bs2;
1756c2916339SPierre Jolivet       }
1757c2916339SPierre Jolivet       z += bs;
1758c2916339SPierre Jolivet     }
1759c2916339SPierre Jolivet   }
1760c2916339SPierre Jolivet   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1761c2916339SPierre Jolivet   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1762c2916339SPierre Jolivet   PetscFunctionReturn(0);
1763c2916339SPierre Jolivet }
1764