xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision c2916339d3c5437df97526bab4e3cf11b3acd91c)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3*c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
4*c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
6c6db04a5SJed Brown #include <petscbt.h>
7c6db04a5SJed Brown #include <petscblaslapack.h>
849b5e25fSSatish Balay 
913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1049b5e25fSSatish Balay {
115eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
126849ba73SBarry Smith   PetscErrorCode ierr;
135d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
145d0c19d7SBarry Smith   const PetscInt *idx;
15db41cccfSHong Zhang   PetscBT        table_out,table_in;
16d94109b8SHong Zhang 
17d94109b8SHong Zhang   PetscFunctionBegin;
18e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19d94109b8SHong Zhang   mbs  = a->mbs;
20d94109b8SHong Zhang   ai   = a->i;
21d94109b8SHong Zhang   aj   = a->j;
22d0f46423SBarry Smith   bs   = A->rmap->bs;
2353b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2653b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27d94109b8SHong Zhang 
28d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
29d94109b8SHong Zhang     isz  = 0;
30db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31d94109b8SHong Zhang 
32d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
33d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35d94109b8SHong Zhang 
36db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37dbe03f88SHong Zhang     bcol_max = 0;
38d94109b8SHong Zhang     for (j=0; j<n; ++j) {
39d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
40e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
42dbe03f88SHong Zhang         nidx[isz++] = brow;
43dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
44dbe03f88SHong Zhang       }
45d94109b8SHong Zhang     }
46d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
476bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48d94109b8SHong Zhang 
49d94109b8SHong Zhang     k = 0;
50d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
51db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54d94109b8SHong Zhang 
55d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
56d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
57d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
58db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59d94109b8SHong Zhang           for (l = start; l<end; l++) {
60d94109b8SHong Zhang             bcol = aj[l];
61d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
62d7b97159SHong Zhang               nidx[isz++] = bcol;
63d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
64d7b97159SHong Zhang             }
65d94109b8SHong Zhang           }
66d94109b8SHong Zhang           k++;
67d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69d94109b8SHong Zhang           for (l = start; l<end; l++) {
70d94109b8SHong Zhang             bcol = aj[l];
71dbe03f88SHong Zhang             if (bcol > bcol_max) break;
72db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7326fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
75d94109b8SHong Zhang             }
76d94109b8SHong Zhang           }
77d94109b8SHong Zhang         }
78d94109b8SHong Zhang       }
79d94109b8SHong Zhang     } /* for each overlap */
80d94109b8SHong Zhang 
81d94109b8SHong Zhang     /* expand the Index Set */
82d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8326fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84d94109b8SHong Zhang     }
8570b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86d94109b8SHong Zhang   }
8794bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9094bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
915eee224dSHong Zhang   PetscFunctionReturn(0);
9249b5e25fSSatish Balay }
9349b5e25fSSatish Balay 
94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
96847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
9749b5e25fSSatish Balay {
986849ba73SBarry Smith   PetscErrorCode ierr;
9949b5e25fSSatish Balay 
10049b5e25fSSatish Balay   PetscFunctionBegin;
101847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102847daeccSHong Zhang   Bseq->ops->mult                   = 0;
103847daeccSHong Zhang   Bseq->ops->multadd                = 0;
104847daeccSHong Zhang   Bseq->ops->multtranspose          = 0;
105847daeccSHong Zhang   Bseq->ops->multtransposeadd       = 0;
106847daeccSHong Zhang   Bseq->ops->lufactor               = 0;
107847daeccSHong Zhang   Bseq->ops->choleskyfactor         = 0;
108847daeccSHong Zhang   Bseq->ops->lufactorsymbolic       = 0;
109847daeccSHong Zhang   Bseq->ops->choleskyfactorsymbolic = 0;
110847daeccSHong Zhang   Bseq->ops->getinertia             = 0;
11149b5e25fSSatish Balay   PetscFunctionReturn(0);
11249b5e25fSSatish Balay }
11349b5e25fSSatish Balay 
1147dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
1157dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116e50a8114SHong Zhang {
117e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118e50a8114SHong Zhang   PetscErrorCode ierr;
119e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121e50a8114SHong Zhang   const PetscInt *irow,*icol;
122e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
124e50a8114SHong Zhang   MatScalar      *mat_a;
125e50a8114SHong Zhang   Mat            C;
126e50a8114SHong Zhang   PetscBool      flag;
127e50a8114SHong Zhang 
128e50a8114SHong Zhang   PetscFunctionBegin;
129e50a8114SHong Zhang 
130e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134e50a8114SHong Zhang 
135e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136e50a8114SHong Zhang   ssmap = smap;
137e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139e50a8114SHong Zhang   /* determine lens of each row */
140e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
141e50a8114SHong Zhang     kstart  = ai[irow[i]];
142e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
143e50a8114SHong Zhang     lens[i] = 0;
144e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
145e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
146e50a8114SHong Zhang     }
147e50a8114SHong Zhang   }
148e50a8114SHong Zhang   /* Create and fill new matrix */
149e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
150e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
151e50a8114SHong Zhang 
152e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153580bdb30SBarry Smith     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155580bdb30SBarry Smith     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156e50a8114SHong Zhang     C    = *B;
157e50a8114SHong Zhang   } else {
158e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162e50a8114SHong Zhang   }
163e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
164e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
165e50a8114SHong Zhang     row      = irow[i];
166e50a8114SHong Zhang     kstart   = ai[row];
167e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
168e50a8114SHong Zhang     mat_i    = c->i[i];
169e50a8114SHong Zhang     mat_j    = c->j + mat_i;
170e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
171e50a8114SHong Zhang     mat_ilen = c->ilen + i;
172e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
173e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
174e50a8114SHong Zhang         *mat_j++ = tcol - 1;
175580bdb30SBarry Smith         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176e50a8114SHong Zhang         mat_a   += bs2;
177e50a8114SHong Zhang         (*mat_ilen)++;
178e50a8114SHong Zhang       }
179e50a8114SHong Zhang     }
180e50a8114SHong Zhang   }
181e50a8114SHong Zhang   /* sort */
182e50a8114SHong Zhang   {
183e50a8114SHong Zhang     MatScalar *work;
184e50a8114SHong Zhang 
185e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
187e50a8114SHong Zhang       PetscInt ilen;
188e50a8114SHong Zhang       mat_i = c->i[i];
189e50a8114SHong Zhang       mat_j = c->j + mat_i;
190e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
191e50a8114SHong Zhang       ilen  = c->ilen[i];
192e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193e50a8114SHong Zhang     }
194e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
195e50a8114SHong Zhang   }
196e50a8114SHong Zhang 
197e50a8114SHong Zhang   /* Free work space */
198e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
200e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
201e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203e50a8114SHong Zhang 
204e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205e50a8114SHong Zhang   *B   = C;
206e50a8114SHong Zhang   PetscFunctionReturn(0);
207e50a8114SHong Zhang }
208e50a8114SHong Zhang 
2097dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
21049b5e25fSSatish Balay {
211e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212e50a8114SHong Zhang   IS             is1,is2;
2136849ba73SBarry Smith   PetscErrorCode ierr;
214e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215e50a8114SHong Zhang   const PetscInt *irow,*icol;
21649b5e25fSSatish Balay 
21749b5e25fSSatish Balay   PetscFunctionBegin;
218e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222e50a8114SHong Zhang 
223e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
224e50a8114SHong Zhang    and form the IS with compressed IS */
225e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
226e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
230e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231e50a8114SHong Zhang   }
232e50a8114SHong Zhang   count = 0;
233e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
234e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
235e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
236e50a8114SHong Zhang   }
237e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238e50a8114SHong Zhang 
239580bdb30SBarry Smith   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
242e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243e50a8114SHong Zhang   }
244e50a8114SHong Zhang   count = 0;
245e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
246e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
247e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
248e50a8114SHong Zhang   }
249e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253e50a8114SHong Zhang 
2547dae84e0SHong Zhang   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257847daeccSHong Zhang 
2588f46ffcaSHong Zhang   if (isrow != iscol) {
2598f46ffcaSHong Zhang     PetscBool isequal;
2608f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261847daeccSHong Zhang     if (!isequal) {
262847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2638f46ffcaSHong Zhang     }
26449b5e25fSSatish Balay   }
26549b5e25fSSatish Balay   PetscFunctionReturn(0);
26649b5e25fSSatish Balay }
26749b5e25fSSatish Balay 
2687dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
26949b5e25fSSatish Balay {
2706849ba73SBarry Smith   PetscErrorCode ierr;
27113f74950SBarry Smith   PetscInt       i;
27249b5e25fSSatish Balay 
27349b5e25fSSatish Balay   PetscFunctionBegin;
274e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
275df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276afebec48SHong Zhang   }
277e50a8114SHong Zhang 
278e50a8114SHong Zhang   for (i=0; i<n; i++) {
2797dae84e0SHong Zhang     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
28049b5e25fSSatish Balay   }
28149b5e25fSSatish Balay   PetscFunctionReturn(0);
28249b5e25fSSatish Balay }
28349b5e25fSSatish Balay 
28449b5e25fSSatish Balay /* -------------------------------------------------------*/
28549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
28649b5e25fSSatish Balay /* -------------------------------------------------------*/
28749b5e25fSSatish Balay 
288dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
28949b5e25fSSatish Balay {
29049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
292d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
293d9ca1df4SBarry Smith   const MatScalar   *v;
2946849ba73SBarry Smith   PetscErrorCode    ierr;
295d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
29798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29849b5e25fSSatish Balay 
29949b5e25fSSatish Balay   PetscFunctionBegin;
3002dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
302d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3031ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   v  = a->a;
30649b5e25fSSatish Balay   xb = x;
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
31149b5e25fSSatish Balay     ib          = aj + *ai;
312831a3094SHong Zhang     jmin        = 0;
31398c9bda7SSatish Balay     nonzerorow += (n>0);
3147fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31549b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
31649b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
317831a3094SHong Zhang       v        += 4; jmin++;
3187fbae186SHong Zhang     }
319444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32349b5e25fSSatish Balay       cval       = ib[j]*2;
32449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
32549b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
32649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32749b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
32849b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
32949b5e25fSSatish Balay       v        += 4;
33049b5e25fSSatish Balay     }
33149b5e25fSSatish Balay     xb +=2; ai++;
33249b5e25fSSatish Balay   }
33349b5e25fSSatish Balay 
334d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3351ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
33749b5e25fSSatish Balay   PetscFunctionReturn(0);
33849b5e25fSSatish Balay }
33949b5e25fSSatish Balay 
340dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
34149b5e25fSSatish Balay {
34249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
344d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
345d9ca1df4SBarry Smith   const MatScalar   *v;
3466849ba73SBarry Smith   PetscErrorCode    ierr;
347d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
34998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
35049b5e25fSSatish Balay 
35149b5e25fSSatish Balay   PetscFunctionBegin;
3522dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
354d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3551ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35649b5e25fSSatish Balay 
35749b5e25fSSatish Balay   v  = a->a;
35849b5e25fSSatish Balay   xb = x;
35949b5e25fSSatish Balay 
36049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
36349b5e25fSSatish Balay     ib          = aj + *ai;
364831a3094SHong Zhang     jmin        = 0;
36598c9bda7SSatish Balay     nonzerorow += (n>0);
3667fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
36749b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
36849b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
36949b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370831a3094SHong Zhang       v        += 9; jmin++;
3717fbae186SHong Zhang     }
372444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374831a3094SHong Zhang     for (j=jmin; j<n; j++) {
37549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37649b5e25fSSatish Balay       cval       = ib[j]*3;
37749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
37849b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
37949b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
38049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38149b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
38249b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
38349b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
38449b5e25fSSatish Balay       v        += 9;
38549b5e25fSSatish Balay     }
38649b5e25fSSatish Balay     xb +=3; ai++;
38749b5e25fSSatish Balay   }
38849b5e25fSSatish Balay 
389d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3901ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
39249b5e25fSSatish Balay   PetscFunctionReturn(0);
39349b5e25fSSatish Balay }
39449b5e25fSSatish Balay 
395dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
39649b5e25fSSatish Balay {
39749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
400d9ca1df4SBarry Smith   const MatScalar   *v;
4016849ba73SBarry Smith   PetscErrorCode    ierr;
402d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
40498c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
40549b5e25fSSatish Balay 
40649b5e25fSSatish Balay   PetscFunctionBegin;
4072dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
409d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4101ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
41149b5e25fSSatish Balay 
41249b5e25fSSatish Balay   v  = a->a;
41349b5e25fSSatish Balay   xb = x;
41449b5e25fSSatish Balay 
41549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
41649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
41749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
41849b5e25fSSatish Balay     ib          = aj + *ai;
419831a3094SHong Zhang     jmin        = 0;
42098c9bda7SSatish Balay     nonzerorow += (n>0);
4217fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
42249b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
42349b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
42449b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
42549b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426831a3094SHong Zhang       v        += 16; jmin++;
4277fbae186SHong Zhang     }
428444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430831a3094SHong Zhang     for (j=jmin; j<n; j++) {
43149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
43249b5e25fSSatish Balay       cval       = ib[j]*4;
43349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
43449b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
43549b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
43649b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
43749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
43849b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
43949b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
44049b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
44149b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
44249b5e25fSSatish Balay       v        += 16;
44349b5e25fSSatish Balay     }
44449b5e25fSSatish Balay     xb +=4; ai++;
44549b5e25fSSatish Balay   }
44649b5e25fSSatish Balay 
447d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4481ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
45049b5e25fSSatish Balay   PetscFunctionReturn(0);
45149b5e25fSSatish Balay }
45249b5e25fSSatish Balay 
453dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
45449b5e25fSSatish Balay {
45549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
458d9ca1df4SBarry Smith   const MatScalar   *v;
4596849ba73SBarry Smith   PetscErrorCode    ierr;
460d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
46298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
46349b5e25fSSatish Balay 
46449b5e25fSSatish Balay   PetscFunctionBegin;
4652dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
467d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4681ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
46949b5e25fSSatish Balay 
47049b5e25fSSatish Balay   v  = a->a;
47149b5e25fSSatish Balay   xb = x;
47249b5e25fSSatish Balay 
47349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
47449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
47549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
47649b5e25fSSatish Balay     ib          = aj + *ai;
477831a3094SHong Zhang     jmin        = 0;
47898c9bda7SSatish Balay     nonzerorow += (n>0);
4797fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
48049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
48149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
48249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
48349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
48449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485831a3094SHong Zhang       v        += 25; jmin++;
4867fbae186SHong Zhang     }
487444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489831a3094SHong Zhang     for (j=jmin; j<n; j++) {
49049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
49149b5e25fSSatish Balay       cval       = ib[j]*5;
49249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
49349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
49449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
49549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
49649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
49749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
49849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
49949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
50049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
50149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
50249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
50349b5e25fSSatish Balay       v        += 25;
50449b5e25fSSatish Balay     }
50549b5e25fSSatish Balay     xb +=5; ai++;
50649b5e25fSSatish Balay   }
50749b5e25fSSatish Balay 
508d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5091ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
51149b5e25fSSatish Balay   PetscFunctionReturn(0);
51249b5e25fSSatish Balay }
51349b5e25fSSatish Balay 
514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
51549b5e25fSSatish Balay {
51649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
519d9ca1df4SBarry Smith   const MatScalar   *v;
5206849ba73SBarry Smith   PetscErrorCode    ierr;
521d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
52398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
52449b5e25fSSatish Balay 
52549b5e25fSSatish Balay   PetscFunctionBegin;
5262dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
528d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5291ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
53049b5e25fSSatish Balay 
53149b5e25fSSatish Balay   v  = a->a;
53249b5e25fSSatish Balay   xb = x;
53349b5e25fSSatish Balay 
53449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
53549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
53649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
53749b5e25fSSatish Balay     ib          = aj + *ai;
538831a3094SHong Zhang     jmin        = 0;
53998c9bda7SSatish Balay     nonzerorow += (n>0);
5407fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
54149b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
54249b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
54349b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
54449b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
54549b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
54649b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547831a3094SHong Zhang       v        += 36; jmin++;
5487fbae186SHong Zhang     }
549444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551831a3094SHong Zhang     for (j=jmin; j<n; j++) {
55249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
55349b5e25fSSatish Balay       cval       = ib[j]*6;
55449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
55549b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
55649b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
55749b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
55849b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
55949b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
56049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
56149b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
56249b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
56349b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
56449b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
56549b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
56649b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
56749b5e25fSSatish Balay       v        += 36;
56849b5e25fSSatish Balay     }
56949b5e25fSSatish Balay     xb +=6; ai++;
57049b5e25fSSatish Balay   }
57149b5e25fSSatish Balay 
572d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5731ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
57549b5e25fSSatish Balay   PetscFunctionReturn(0);
57649b5e25fSSatish Balay }
577*c2916339SPierre Jolivet 
578dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
57949b5e25fSSatish Balay {
58049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
583d9ca1df4SBarry Smith   const MatScalar   *v;
5846849ba73SBarry Smith   PetscErrorCode    ierr;
585d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
58798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
58849b5e25fSSatish Balay 
58949b5e25fSSatish Balay   PetscFunctionBegin;
5902dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
592d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5931ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
59449b5e25fSSatish Balay 
59549b5e25fSSatish Balay   v  = a->a;
59649b5e25fSSatish Balay   xb = x;
59749b5e25fSSatish Balay 
59849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
59949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
60049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
60149b5e25fSSatish Balay     ib          = aj + *ai;
602831a3094SHong Zhang     jmin        = 0;
60398c9bda7SSatish Balay     nonzerorow += (n>0);
6047fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
60549b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
60649b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
60749b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
60849b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
60949b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
61049b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
61149b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612831a3094SHong Zhang       v        += 49; jmin++;
6137fbae186SHong Zhang     }
614444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616831a3094SHong Zhang     for (j=jmin; j<n; j++) {
61749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
61849b5e25fSSatish Balay       cval       = ib[j]*7;
61949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
62049b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
62149b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
62249b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
62349b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
62449b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
62549b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
62649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
62749b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
62849b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
62949b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
63049b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
63149b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
63249b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
63349b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
63449b5e25fSSatish Balay       v       += 49;
63549b5e25fSSatish Balay     }
63649b5e25fSSatish Balay     xb +=7; ai++;
63749b5e25fSSatish Balay   }
638d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6391ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
64149b5e25fSSatish Balay   PetscFunctionReturn(0);
64249b5e25fSSatish Balay }
64349b5e25fSSatish Balay 
64449b5e25fSSatish Balay /*
64549b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
64649b5e25fSSatish Balay */
647dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
64849b5e25fSSatish Balay {
64949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
652d9ca1df4SBarry Smith   const MatScalar   *v;
653dfbe8321SBarry Smith   PetscErrorCode    ierr;
654d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
65698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
65749b5e25fSSatish Balay 
65849b5e25fSSatish Balay   PetscFunctionBegin;
6592dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
661d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6621ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
66349b5e25fSSatish Balay 
66449b5e25fSSatish Balay   aj = a->j;
66549b5e25fSSatish Balay   v  = a->a;
66649b5e25fSSatish Balay   ii = a->i;
66749b5e25fSSatish Balay 
66849b5e25fSSatish Balay   if (!a->mult_work) {
669854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
67049b5e25fSSatish Balay   }
67149b5e25fSSatish Balay   work = a->mult_work;
67249b5e25fSSatish Balay 
67349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
67449b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
67549b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
67698c9bda7SSatish Balay     nonzerorow += (n>0);
67749b5e25fSSatish Balay 
67849b5e25fSSatish Balay     /* upper triangular part */
67949b5e25fSSatish Balay     for (j=0; j<n; j++) {
68049b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
68149b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
68249b5e25fSSatish Balay       workt += bs;
68349b5e25fSSatish Balay     }
68449b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
68596b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
68649b5e25fSSatish Balay 
68749b5e25fSSatish Balay     /* strict lower triangular part */
688831a3094SHong Zhang     idx = aj+ii[0];
689831a3094SHong Zhang     if (*idx == i) {
69096b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
691831a3094SHong Zhang     }
69296b9376eSHong Zhang 
69349b5e25fSSatish Balay     if (ncols > 0) {
69449b5e25fSSatish Balay       workt = work;
695580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
69696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
697831a3094SHong Zhang       for (j=0; j<n; j++) {
698831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
69949b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
70049b5e25fSSatish Balay         workt += bs;
70149b5e25fSSatish Balay       }
70249b5e25fSSatish Balay     }
70349b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
70449b5e25fSSatish Balay   }
70549b5e25fSSatish Balay 
706d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
7071ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
708dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
70949b5e25fSSatish Balay   PetscFunctionReturn(0);
71049b5e25fSSatish Balay }
71149b5e25fSSatish Balay 
712dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
71349b5e25fSSatish Balay {
71449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
715d9ca1df4SBarry Smith   PetscScalar       *z,x1;
716d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
717d9ca1df4SBarry Smith   const MatScalar   *v;
7186849ba73SBarry Smith   PetscErrorCode    ierr;
719d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
720d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72249b5e25fSSatish Balay 
72349b5e25fSSatish Balay   PetscFunctionBegin;
724343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
725c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
726d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7271ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
72849b5e25fSSatish Balay   v    = a->a;
72949b5e25fSSatish Balay   xb   = x;
73049b5e25fSSatish Balay 
73149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
73349b5e25fSSatish Balay     x1          = xb[0];
73449b5e25fSSatish Balay     ib          = aj + *ai;
735831a3094SHong Zhang     jmin        = 0;
73698c9bda7SSatish Balay     nonzerorow += (n>0);
737831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
738831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
739831a3094SHong Zhang     }
740831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74149b5e25fSSatish Balay       cval    = *ib;
74249b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
74349b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
74449b5e25fSSatish Balay     }
74549b5e25fSSatish Balay     xb++; ai++;
74649b5e25fSSatish Balay   }
74749b5e25fSSatish Balay 
748d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
749bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
75049b5e25fSSatish Balay 
751dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
75249b5e25fSSatish Balay   PetscFunctionReturn(0);
75349b5e25fSSatish Balay }
75449b5e25fSSatish Balay 
755dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
75649b5e25fSSatish Balay {
75749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
758d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
759d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
760d9ca1df4SBarry Smith   const MatScalar   *v;
7616849ba73SBarry Smith   PetscErrorCode    ierr;
762d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
763d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
76498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
76549b5e25fSSatish Balay 
76649b5e25fSSatish Balay   PetscFunctionBegin;
767343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
768c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
769d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7701ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
77149b5e25fSSatish Balay 
77249b5e25fSSatish Balay   v  = a->a;
77349b5e25fSSatish Balay   xb = x;
77449b5e25fSSatish Balay 
77549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
77649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
77749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
77849b5e25fSSatish Balay     ib          = aj + *ai;
779831a3094SHong Zhang     jmin        = 0;
78098c9bda7SSatish Balay     nonzerorow += (n>0);
7817fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
78249b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
78349b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
784831a3094SHong Zhang       v        += 4; jmin++;
7857fbae186SHong Zhang     }
786444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
787444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
788831a3094SHong Zhang     for (j=jmin; j<n; j++) {
78949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
79049b5e25fSSatish Balay       cval       = ib[j]*2;
79149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
79249b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
79349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
79449b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
79549b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
79649b5e25fSSatish Balay       v        += 4;
79749b5e25fSSatish Balay     }
79849b5e25fSSatish Balay     xb +=2; ai++;
79949b5e25fSSatish Balay   }
800d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
801bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
80249b5e25fSSatish Balay 
803dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
80449b5e25fSSatish Balay   PetscFunctionReturn(0);
80549b5e25fSSatish Balay }
80649b5e25fSSatish Balay 
807dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
80849b5e25fSSatish Balay {
80949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
810d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
811d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
812d9ca1df4SBarry Smith   const MatScalar   *v;
8136849ba73SBarry Smith   PetscErrorCode    ierr;
814d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
815d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
81698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
81749b5e25fSSatish Balay 
81849b5e25fSSatish Balay   PetscFunctionBegin;
819343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
820c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
821d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8221ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
82349b5e25fSSatish Balay 
82449b5e25fSSatish Balay   v  = a->a;
82549b5e25fSSatish Balay   xb = x;
82649b5e25fSSatish Balay 
82749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
82849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
82949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
83049b5e25fSSatish Balay     ib          = aj + *ai;
831831a3094SHong Zhang     jmin        = 0;
83298c9bda7SSatish Balay     nonzerorow += (n>0);
8337fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
83449b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
83549b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
83649b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
837831a3094SHong Zhang       v        += 9; jmin++;
8387fbae186SHong Zhang     }
839444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
841831a3094SHong Zhang     for (j=jmin; j<n; j++) {
84249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
84349b5e25fSSatish Balay       cval       = ib[j]*3;
84449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
84549b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
84649b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
84749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
84849b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
84949b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
85049b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
85149b5e25fSSatish Balay       v        += 9;
85249b5e25fSSatish Balay     }
85349b5e25fSSatish Balay     xb +=3; ai++;
85449b5e25fSSatish Balay   }
85549b5e25fSSatish Balay 
856d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
857bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
85849b5e25fSSatish Balay 
859dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
86049b5e25fSSatish Balay   PetscFunctionReturn(0);
86149b5e25fSSatish Balay }
86249b5e25fSSatish Balay 
863dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
86449b5e25fSSatish Balay {
86549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
866d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
867d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
868d9ca1df4SBarry Smith   const MatScalar   *v;
8696849ba73SBarry Smith   PetscErrorCode    ierr;
870d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
871d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
87298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
87349b5e25fSSatish Balay 
87449b5e25fSSatish Balay   PetscFunctionBegin;
875343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
876c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
877d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8781ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
87949b5e25fSSatish Balay 
88049b5e25fSSatish Balay   v  = a->a;
88149b5e25fSSatish Balay   xb = x;
88249b5e25fSSatish Balay 
88349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
88449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
88549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
88649b5e25fSSatish Balay     ib          = aj + *ai;
887831a3094SHong Zhang     jmin        = 0;
88898c9bda7SSatish Balay     nonzerorow += (n>0);
8897fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
89049b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
89149b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
89249b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
89349b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
894831a3094SHong Zhang       v        += 16; jmin++;
8957fbae186SHong Zhang     }
896444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
897444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
898831a3094SHong Zhang     for (j=jmin; j<n; j++) {
89949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
90049b5e25fSSatish Balay       cval       = ib[j]*4;
90149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
90249b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
90349b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
90449b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
90549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
90649b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
90749b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
90849b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
90949b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
91049b5e25fSSatish Balay       v        += 16;
91149b5e25fSSatish Balay     }
91249b5e25fSSatish Balay     xb +=4; ai++;
91349b5e25fSSatish Balay   }
91449b5e25fSSatish Balay 
915d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
916bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
91749b5e25fSSatish Balay 
918dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
91949b5e25fSSatish Balay   PetscFunctionReturn(0);
92049b5e25fSSatish Balay }
92149b5e25fSSatish Balay 
922dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
92349b5e25fSSatish Balay {
92449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
925d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
926d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
927d9ca1df4SBarry Smith   const MatScalar   *v;
9286849ba73SBarry Smith   PetscErrorCode    ierr;
929d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
930d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
93198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
93249b5e25fSSatish Balay 
93349b5e25fSSatish Balay   PetscFunctionBegin;
934343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
935c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
936d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9371ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
93849b5e25fSSatish Balay 
93949b5e25fSSatish Balay   v  = a->a;
94049b5e25fSSatish Balay   xb = x;
94149b5e25fSSatish Balay 
94249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
94349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
94449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
94549b5e25fSSatish Balay     ib          = aj + *ai;
946831a3094SHong Zhang     jmin        = 0;
94798c9bda7SSatish Balay     nonzerorow += (n>0);
9487fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
94949b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
95049b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
95149b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
95249b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
95349b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
954831a3094SHong Zhang       v        += 25; jmin++;
9557fbae186SHong Zhang     }
956444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
957444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
958831a3094SHong Zhang     for (j=jmin; j<n; j++) {
95949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
96049b5e25fSSatish Balay       cval       = ib[j]*5;
96149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
96249b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
96349b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
96449b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
96549b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
96649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
96749b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
96849b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
96949b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
97049b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
97149b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
97249b5e25fSSatish Balay       v        += 25;
97349b5e25fSSatish Balay     }
97449b5e25fSSatish Balay     xb +=5; ai++;
97549b5e25fSSatish Balay   }
97649b5e25fSSatish Balay 
977d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
978bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
97949b5e25fSSatish Balay 
980dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
98149b5e25fSSatish Balay   PetscFunctionReturn(0);
98249b5e25fSSatish Balay }
983*c2916339SPierre Jolivet 
984dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
98549b5e25fSSatish Balay {
98649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
987d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
988d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
989d9ca1df4SBarry Smith   const MatScalar   *v;
9906849ba73SBarry Smith   PetscErrorCode    ierr;
991d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
992d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
99398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
99449b5e25fSSatish Balay 
99549b5e25fSSatish Balay   PetscFunctionBegin;
996343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
997c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
998d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
100049b5e25fSSatish Balay 
100149b5e25fSSatish Balay   v  = a->a;
100249b5e25fSSatish Balay   xb = x;
100349b5e25fSSatish Balay 
100449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
100549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
100649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
100749b5e25fSSatish Balay     ib          = aj + *ai;
1008831a3094SHong Zhang     jmin        = 0;
100998c9bda7SSatish Balay     nonzerorow += (n>0);
10107fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
101149b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
101249b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
101349b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
101449b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
101549b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
101649b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1017831a3094SHong Zhang       v        += 36; jmin++;
10187fbae186SHong Zhang     }
1019444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1020444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021831a3094SHong Zhang     for (j=jmin; j<n; j++) {
102249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
102349b5e25fSSatish Balay       cval       = ib[j]*6;
102449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
102549b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
102649b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
102749b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
102849b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
102949b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
103049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
103149b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
103249b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
103349b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
103449b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
103549b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
103649b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
103749b5e25fSSatish Balay       v        += 36;
103849b5e25fSSatish Balay     }
103949b5e25fSSatish Balay     xb +=6; ai++;
104049b5e25fSSatish Balay   }
104149b5e25fSSatish Balay 
1042d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1043bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
104449b5e25fSSatish Balay 
1045dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
104649b5e25fSSatish Balay   PetscFunctionReturn(0);
104749b5e25fSSatish Balay }
104849b5e25fSSatish Balay 
1049dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
105049b5e25fSSatish Balay {
105149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1052d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1053d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1054d9ca1df4SBarry Smith   const MatScalar   *v;
10556849ba73SBarry Smith   PetscErrorCode    ierr;
1056d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1057d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
105898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
105949b5e25fSSatish Balay 
106049b5e25fSSatish Balay   PetscFunctionBegin;
1061343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1062c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1063d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10641ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
106549b5e25fSSatish Balay 
106649b5e25fSSatish Balay   v  = a->a;
106749b5e25fSSatish Balay   xb = x;
106849b5e25fSSatish Balay 
106949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
107049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
107149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
107249b5e25fSSatish Balay     ib          = aj + *ai;
1073831a3094SHong Zhang     jmin        = 0;
107498c9bda7SSatish Balay     nonzerorow += (n>0);
10757fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
107649b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
107749b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
107849b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
107949b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
108049b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
108149b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
108249b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1083831a3094SHong Zhang       v        += 49; jmin++;
10847fbae186SHong Zhang     }
1085444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1086444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1087831a3094SHong Zhang     for (j=jmin; j<n; j++) {
108849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
108949b5e25fSSatish Balay       cval       = ib[j]*7;
109049b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
109149b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
109249b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
109349b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
109449b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
109549b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
109649b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
109749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
109849b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
109949b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
110049b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
110149b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
110249b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
110349b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
110449b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
110549b5e25fSSatish Balay       v       += 49;
110649b5e25fSSatish Balay     }
110749b5e25fSSatish Balay     xb +=7; ai++;
110849b5e25fSSatish Balay   }
110949b5e25fSSatish Balay 
1110d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1111bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
111249b5e25fSSatish Balay 
1113dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
111449b5e25fSSatish Balay   PetscFunctionReturn(0);
111549b5e25fSSatish Balay }
111649b5e25fSSatish Balay 
1117dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
111849b5e25fSSatish Balay {
111949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1120d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1121d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1122d9ca1df4SBarry Smith   const MatScalar   *v;
1123dfbe8321SBarry Smith   PetscErrorCode    ierr;
1124d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1125d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
112698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
112749b5e25fSSatish Balay 
112849b5e25fSSatish Balay   PetscFunctionBegin;
1129343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1130c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
1131d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11321ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
113349b5e25fSSatish Balay 
113449b5e25fSSatish Balay   aj = a->j;
113549b5e25fSSatish Balay   v  = a->a;
113649b5e25fSSatish Balay   ii = a->i;
113749b5e25fSSatish Balay 
113849b5e25fSSatish Balay   if (!a->mult_work) {
1139854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
114049b5e25fSSatish Balay   }
114149b5e25fSSatish Balay   work = a->mult_work;
114249b5e25fSSatish Balay 
114349b5e25fSSatish Balay 
114449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
114549b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
114649b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
114798c9bda7SSatish Balay     nonzerorow += (n>0);
114849b5e25fSSatish Balay 
114949b5e25fSSatish Balay     /* upper triangular part */
115049b5e25fSSatish Balay     for (j=0; j<n; j++) {
115149b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
115249b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
115349b5e25fSSatish Balay       workt += bs;
115449b5e25fSSatish Balay     }
115549b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
115696b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
115749b5e25fSSatish Balay 
115849b5e25fSSatish Balay     /* strict lower triangular part */
1159831a3094SHong Zhang     idx = aj+ii[0];
1160831a3094SHong Zhang     if (*idx == i) {
116196b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1162831a3094SHong Zhang     }
116349b5e25fSSatish Balay     if (ncols > 0) {
116449b5e25fSSatish Balay       workt = work;
1165580bdb30SBarry Smith       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
116696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1167831a3094SHong Zhang       for (j=0; j<n; j++) {
1168831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
116949b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
117049b5e25fSSatish Balay         workt += bs;
117149b5e25fSSatish Balay       }
117249b5e25fSSatish Balay     }
117349b5e25fSSatish Balay 
117449b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
117549b5e25fSSatish Balay   }
117649b5e25fSSatish Balay 
1177d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1178bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
117949b5e25fSSatish Balay 
1180dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
118149b5e25fSSatish Balay   PetscFunctionReturn(0);
118249b5e25fSSatish Balay }
118349b5e25fSSatish Balay 
1184f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
118549b5e25fSSatish Balay {
118649b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1187f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1188efee365bSSatish Balay   PetscErrorCode ierr;
1189c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
119049b5e25fSSatish Balay 
119149b5e25fSSatish Balay   PetscFunctionBegin;
1192c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11938b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1194efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
119549b5e25fSSatish Balay   PetscFunctionReturn(0);
119649b5e25fSSatish Balay }
119749b5e25fSSatish Balay 
1198dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
119949b5e25fSSatish Balay {
120049b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1201d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
120249b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1203d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
12046849ba73SBarry Smith   PetscErrorCode  ierr;
1205d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
120649b5e25fSSatish Balay 
120749b5e25fSSatish Balay   PetscFunctionBegin;
1208c40ae873SPierre Jolivet   if (!a->nz) {
1209c40ae873SPierre Jolivet     *norm = 0.0;
1210c40ae873SPierre Jolivet     PetscFunctionReturn(0);
1211c40ae873SPierre Jolivet   }
121249b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
121349b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
121449b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1215831a3094SHong Zhang       col  = aj + jmin;
1216831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
121749b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
121849b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
121949b5e25fSSatish Balay         }
1220831a3094SHong Zhang         jmin++;
1221831a3094SHong Zhang       }
1222831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
122349b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
122449b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
122549b5e25fSSatish Balay         }
122649b5e25fSSatish Balay       }
122749b5e25fSSatish Balay     }
12288f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
122951f70360SJed Brown     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
12300b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1231dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12320b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12330b8dc8d2SHong Zhang     il[0] = 0;
123449b5e25fSSatish Balay 
123549b5e25fSSatish Balay     *norm = 0.0;
123649b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
123749b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
123849b5e25fSSatish Balay       /*-- col sum --*/
123949b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1240831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1241831a3094SHong Zhang                   at step k */
124249b5e25fSSatish Balay       while (i<mbs) {
124349b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
124449b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
124549b5e25fSSatish Balay         for (j=0; j<bs; j++) {
124649b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
124749b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
124849b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
124949b5e25fSSatish Balay           }
125049b5e25fSSatish Balay         }
125149b5e25fSSatish Balay         /* update il, jl */
1252831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1253831a3094SHong Zhang         jmax = a->i[i+1];
125449b5e25fSSatish Balay         if (jmin < jmax) {
125549b5e25fSSatish Balay           il[i] = jmin;
125649b5e25fSSatish Balay           j     = a->j[jmin];
125749b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
125849b5e25fSSatish Balay         }
125949b5e25fSSatish Balay         i = nexti;
126049b5e25fSSatish Balay       }
126149b5e25fSSatish Balay       /*-- row sum --*/
126249b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
126349b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
126449b5e25fSSatish Balay         for (j=0; j<bs; j++) {
126549b5e25fSSatish Balay           v = a->a + i*bs2 + j;
126649b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12670b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
126849b5e25fSSatish Balay           }
126949b5e25fSSatish Balay         }
127049b5e25fSSatish Balay       }
127149b5e25fSSatish Balay       /* add k_th block row to il, jl */
1272831a3094SHong Zhang       col = aj+jmin;
1273831a3094SHong Zhang       if (*col == k) jmin++;
127449b5e25fSSatish Balay       if (jmin < jmax) {
127549b5e25fSSatish Balay         il[k] = jmin;
12760b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
127749b5e25fSSatish Balay       }
127849b5e25fSSatish Balay       for (j=0; j<bs; j++) {
127949b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
128049b5e25fSSatish Balay       }
128149b5e25fSSatish Balay     }
128274ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
128351f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1284f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
128549b5e25fSSatish Balay   PetscFunctionReturn(0);
128649b5e25fSSatish Balay }
128749b5e25fSSatish Balay 
1288ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
128949b5e25fSSatish Balay {
129049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1291dfbe8321SBarry Smith   PetscErrorCode ierr;
129249b5e25fSSatish Balay 
129349b5e25fSSatish Balay   PetscFunctionBegin;
129449b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1295d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1296ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1297ef511fbeSHong Zhang     PetscFunctionReturn(0);
129849b5e25fSSatish Balay   }
129949b5e25fSSatish Balay 
130049b5e25fSSatish Balay   /* if the a->i are the same */
1301580bdb30SBarry Smith   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
130226fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
130349b5e25fSSatish Balay 
130449b5e25fSSatish Balay   /* if a->j are the same */
1305580bdb30SBarry Smith   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
130626fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
130726fbe8dcSKarl Rupp 
130849b5e25fSSatish Balay   /* if a->a are the same */
1309580bdb30SBarry Smith   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1310935af2e7SHong Zhang   PetscFunctionReturn(0);
131149b5e25fSSatish Balay }
131249b5e25fSSatish Balay 
1313dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
131449b5e25fSSatish Balay {
131549b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1316dfbe8321SBarry Smith   PetscErrorCode  ierr;
1317d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1318d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
131987828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1320d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
132149b5e25fSSatish Balay 
132249b5e25fSSatish Balay   PetscFunctionBegin;
1323d0f46423SBarry Smith   bs = A->rmap->bs;
1324e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
132582799104SHong Zhang 
132649b5e25fSSatish Balay   aa   = a->a;
13278a0c6efdSHong Zhang   ambs = a->mbs;
13288a0c6efdSHong Zhang 
13298a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13308a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13318a0c6efdSHong Zhang     aa   = a->a;
13328a0c6efdSHong Zhang     ambs = a->mbs;
13338a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13348a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13358a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13368a0c6efdSHong Zhang     PetscFunctionReturn(0);
13378a0c6efdSHong Zhang   }
13388a0c6efdSHong Zhang 
133949b5e25fSSatish Balay   ai   = a->i;
134049b5e25fSSatish Balay   aj   = a->j;
134149b5e25fSSatish Balay   bs2  = a->bs2;
13422dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
1343c40ae873SPierre Jolivet   if (!a->nz) PetscFunctionReturn(0);
13441ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
134549b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
134649b5e25fSSatish Balay     j=ai[i];
134749b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
134849b5e25fSSatish Balay       row  = i*bs;
134949b5e25fSSatish Balay       aa_j = aa + j*bs2;
135049b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
135149b5e25fSSatish Balay     }
135249b5e25fSSatish Balay   }
13531ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
135449b5e25fSSatish Balay   PetscFunctionReturn(0);
135549b5e25fSSatish Balay }
135649b5e25fSSatish Balay 
1357dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
135849b5e25fSSatish Balay {
135949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1360d9ca1df4SBarry Smith   PetscScalar       x;
1361d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
136249b5e25fSSatish Balay   MatScalar         *aa,*v;
1363dfbe8321SBarry Smith   PetscErrorCode    ierr;
1364fff8e43fSBarry Smith   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1365fff8e43fSBarry Smith   const PetscInt    *ai,*aj;
1366ace3abfcSBarry Smith   PetscBool         flg;
136749b5e25fSSatish Balay 
136849b5e25fSSatish Balay   PetscFunctionBegin;
1369b3bf805bSHong Zhang   if (ll != rr) {
1370b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1371e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1372b3bf805bSHong Zhang   }
1373b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
137449b5e25fSSatish Balay   ai  = a->i;
137549b5e25fSSatish Balay   aj  = a->j;
137649b5e25fSSatish Balay   aa  = a->a;
1377d0f46423SBarry Smith   m   = A->rmap->N;
1378d0f46423SBarry Smith   bs  = A->rmap->bs;
137949b5e25fSSatish Balay   mbs = a->mbs;
138049b5e25fSSatish Balay   bs2 = a->bs2;
138149b5e25fSSatish Balay 
1382d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
138349b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1384e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
138549b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
138649b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
138749b5e25fSSatish Balay     li = l + i*bs;
138849b5e25fSSatish Balay     v  = aa + bs2*ai[i];
138949b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
139049b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13915e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
139249b5e25fSSatish Balay         x = ri[k];
139349b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
139449b5e25fSSatish Balay       }
139549b5e25fSSatish Balay     }
139649b5e25fSSatish Balay   }
1397d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1398dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
139949b5e25fSSatish Balay   PetscFunctionReturn(0);
140049b5e25fSSatish Balay }
140149b5e25fSSatish Balay 
1402dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
140349b5e25fSSatish Balay {
140449b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
140549b5e25fSSatish Balay 
140649b5e25fSSatish Balay   PetscFunctionBegin;
140749b5e25fSSatish Balay   info->block_size   = a->bs2;
1408ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14096c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
14103966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
141149b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14128e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14137adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1414d5f3da31SBarry Smith   if (A->factortype) {
141549b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
141649b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
141749b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
141849b5e25fSSatish Balay   } else {
141949b5e25fSSatish Balay     info->fill_ratio_given  = 0;
142049b5e25fSSatish Balay     info->fill_ratio_needed = 0;
142149b5e25fSSatish Balay     info->factor_mallocs    = 0;
142249b5e25fSSatish Balay   }
142349b5e25fSSatish Balay   PetscFunctionReturn(0);
142449b5e25fSSatish Balay }
142549b5e25fSSatish Balay 
1426dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
142749b5e25fSSatish Balay {
142849b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1429dfbe8321SBarry Smith   PetscErrorCode ierr;
143049b5e25fSSatish Balay 
143149b5e25fSSatish Balay   PetscFunctionBegin;
1432580bdb30SBarry Smith   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
143349b5e25fSSatish Balay   PetscFunctionReturn(0);
143449b5e25fSSatish Balay }
1435dc354874SHong Zhang 
1436985db425SBarry Smith /*
1437985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1438985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1439985db425SBarry Smith */
1440985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1441dc354874SHong Zhang {
1442dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1443dfbe8321SBarry Smith   PetscErrorCode  ierr;
1444d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1445d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1446c3fca9a7SHong Zhang   PetscReal       atmp;
1447d9ca1df4SBarry Smith   const MatScalar *aa;
1448985db425SBarry Smith   PetscScalar     *x;
144913f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1450dc354874SHong Zhang 
1451dc354874SHong Zhang   PetscFunctionBegin;
1452e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1453e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1454d0f46423SBarry Smith   bs  = A->rmap->bs;
1455dc354874SHong Zhang   aa  = a->a;
1456dc354874SHong Zhang   ai  = a->i;
1457dc354874SHong Zhang   aj  = a->j;
145844117c81SHong Zhang   mbs = a->mbs;
1459dc354874SHong Zhang 
1460985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14611ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1462dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1463e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
146444117c81SHong Zhang   for (i=0; i<mbs; i++) {
1465d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1466d0f6400bSHong Zhang     brow  = bs*i;
146744117c81SHong Zhang     for (j=0; j<ncols; j++) {
1468d0f6400bSHong Zhang       bcol = bs*(*aj);
146944117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1470d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
147144117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1472d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1473d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1474c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1475c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
147644117c81SHong Zhang         }
147744117c81SHong Zhang       }
1478d0f6400bSHong Zhang       aj++;
1479dc354874SHong Zhang     }
1480dc354874SHong Zhang   }
14811ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1482dc354874SHong Zhang   PetscFunctionReturn(0);
1483dc354874SHong Zhang }
1484*c2916339SPierre Jolivet 
1485*c2916339SPierre Jolivet PETSC_INTERN PetscErrorCode MatMatMult_SeqSBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1486*c2916339SPierre Jolivet {
1487*c2916339SPierre Jolivet   PetscErrorCode ierr;
1488*c2916339SPierre Jolivet 
1489*c2916339SPierre Jolivet   PetscFunctionBegin;
1490*c2916339SPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) {
1491*c2916339SPierre Jolivet     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1492*c2916339SPierre Jolivet     ierr = MatMatMultSymbolic_SeqSBAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1493*c2916339SPierre Jolivet     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1494*c2916339SPierre Jolivet   }
1495*c2916339SPierre Jolivet   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1496*c2916339SPierre Jolivet   ierr = MatMatMultNumeric_SeqSBAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1497*c2916339SPierre Jolivet   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1498*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1499*c2916339SPierre Jolivet }
1500*c2916339SPierre Jolivet 
1501*c2916339SPierre Jolivet PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1502*c2916339SPierre Jolivet {
1503*c2916339SPierre Jolivet   PetscErrorCode ierr;
1504*c2916339SPierre Jolivet 
1505*c2916339SPierre Jolivet   PetscFunctionBegin;
1506*c2916339SPierre Jolivet   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1507*c2916339SPierre Jolivet 
1508*c2916339SPierre Jolivet   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1509*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1510*c2916339SPierre Jolivet }
1511*c2916339SPierre Jolivet 
1512*c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1513*c2916339SPierre Jolivet {
1514*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1515*c2916339SPierre Jolivet   PetscScalar       *z = c;
1516*c2916339SPierre Jolivet   const PetscScalar *xb;
1517*c2916339SPierre Jolivet   PetscScalar       x1;
1518*c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1519*c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1520*c2916339SPierre Jolivet 
1521*c2916339SPierre Jolivet   PetscFunctionBegin;
1522*c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1523*c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1524*c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1525*c2916339SPierre Jolivet     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1526*c2916339SPierre Jolivet     jj = idx;
1527*c2916339SPierre Jolivet     vv = v;
1528*c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1529*c2916339SPierre Jolivet       idx = jj;
1530*c2916339SPierre Jolivet       v = vv;
1531*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1532*c2916339SPierre Jolivet         xb = b + (*idx); x1 = xb[0+k*bm];
1533*c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1;
1534*c2916339SPierre Jolivet         if (*idx != i) c[(*idx)+k*cm] += v[0]*b[i+k*bm];
1535*c2916339SPierre Jolivet         v += 1;
1536*c2916339SPierre Jolivet         ++idx;
1537*c2916339SPierre Jolivet       }
1538*c2916339SPierre Jolivet     }
1539*c2916339SPierre Jolivet     z += 1;
1540*c2916339SPierre Jolivet   }
1541*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1542*c2916339SPierre Jolivet }
1543*c2916339SPierre Jolivet 
1544*c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1545*c2916339SPierre Jolivet {
1546*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1547*c2916339SPierre Jolivet   PetscScalar       *z = c;
1548*c2916339SPierre Jolivet   const PetscScalar *xb;
1549*c2916339SPierre Jolivet   PetscScalar       x1,x2;
1550*c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1551*c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1552*c2916339SPierre Jolivet 
1553*c2916339SPierre Jolivet   PetscFunctionBegin;
1554*c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1555*c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1556*c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1557*c2916339SPierre Jolivet     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1558*c2916339SPierre Jolivet     jj = idx;
1559*c2916339SPierre Jolivet     vv = v;
1560*c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1561*c2916339SPierre Jolivet       idx = jj;
1562*c2916339SPierre Jolivet       v = vv;
1563*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1564*c2916339SPierre Jolivet         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1565*c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1566*c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1567*c2916339SPierre Jolivet         if (*idx != i) {
1568*c2916339SPierre Jolivet           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1569*c2916339SPierre Jolivet           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1570*c2916339SPierre Jolivet         }
1571*c2916339SPierre Jolivet         v += 4;
1572*c2916339SPierre Jolivet         ++idx;
1573*c2916339SPierre Jolivet       }
1574*c2916339SPierre Jolivet     }
1575*c2916339SPierre Jolivet     z += 2;
1576*c2916339SPierre Jolivet   }
1577*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1578*c2916339SPierre Jolivet }
1579*c2916339SPierre Jolivet 
1580*c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1581*c2916339SPierre Jolivet {
1582*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1583*c2916339SPierre Jolivet   PetscScalar       *z = c;
1584*c2916339SPierre Jolivet   const PetscScalar *xb;
1585*c2916339SPierre Jolivet   PetscScalar       x1,x2,x3;
1586*c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1587*c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1588*c2916339SPierre Jolivet 
1589*c2916339SPierre Jolivet   PetscFunctionBegin;
1590*c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1591*c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1592*c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1593*c2916339SPierre Jolivet     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1594*c2916339SPierre Jolivet     jj = idx;
1595*c2916339SPierre Jolivet     vv = v;
1596*c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1597*c2916339SPierre Jolivet       idx = jj;
1598*c2916339SPierre Jolivet       v = vv;
1599*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1600*c2916339SPierre Jolivet         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1601*c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1602*c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1603*c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1604*c2916339SPierre Jolivet         if (*idx != i) {
1605*c2916339SPierre Jolivet           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1606*c2916339SPierre Jolivet           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1607*c2916339SPierre Jolivet           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1608*c2916339SPierre Jolivet         }
1609*c2916339SPierre Jolivet         v += 9;
1610*c2916339SPierre Jolivet         ++idx;
1611*c2916339SPierre Jolivet       }
1612*c2916339SPierre Jolivet     }
1613*c2916339SPierre Jolivet     z += 3;
1614*c2916339SPierre Jolivet   }
1615*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1616*c2916339SPierre Jolivet }
1617*c2916339SPierre Jolivet 
1618*c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1619*c2916339SPierre Jolivet {
1620*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1621*c2916339SPierre Jolivet   PetscScalar       *z = c;
1622*c2916339SPierre Jolivet   const PetscScalar *xb;
1623*c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4;
1624*c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1625*c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1626*c2916339SPierre Jolivet 
1627*c2916339SPierre Jolivet   PetscFunctionBegin;
1628*c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1629*c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1630*c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1631*c2916339SPierre Jolivet     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1632*c2916339SPierre Jolivet     jj = idx;
1633*c2916339SPierre Jolivet     vv = v;
1634*c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1635*c2916339SPierre Jolivet       idx = jj;
1636*c2916339SPierre Jolivet       v = vv;
1637*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1638*c2916339SPierre Jolivet         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1639*c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1640*c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1641*c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1642*c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1643*c2916339SPierre Jolivet         if (*idx != i) {
1644*c2916339SPierre Jolivet           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1645*c2916339SPierre Jolivet           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1646*c2916339SPierre Jolivet           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1647*c2916339SPierre Jolivet           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1648*c2916339SPierre Jolivet         }
1649*c2916339SPierre Jolivet         v += 16;
1650*c2916339SPierre Jolivet         ++idx;
1651*c2916339SPierre Jolivet       }
1652*c2916339SPierre Jolivet     }
1653*c2916339SPierre Jolivet     z += 4;
1654*c2916339SPierre Jolivet   }
1655*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1656*c2916339SPierre Jolivet }
1657*c2916339SPierre Jolivet 
1658*c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1659*c2916339SPierre Jolivet {
1660*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1661*c2916339SPierre Jolivet   PetscScalar       *z = c;
1662*c2916339SPierre Jolivet   const PetscScalar *xb;
1663*c2916339SPierre Jolivet   PetscScalar       x1,x2,x3,x4,x5;
1664*c2916339SPierre Jolivet   const MatScalar   *v = a->a,*vv;
1665*c2916339SPierre Jolivet   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1666*c2916339SPierre Jolivet 
1667*c2916339SPierre Jolivet   PetscFunctionBegin;
1668*c2916339SPierre Jolivet   for (i=0; i<mbs; i++) {
1669*c2916339SPierre Jolivet     n = ii[1] - ii[0]; ii++;
1670*c2916339SPierre Jolivet     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1671*c2916339SPierre Jolivet     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1672*c2916339SPierre Jolivet     jj = idx;
1673*c2916339SPierre Jolivet     vv = v;
1674*c2916339SPierre Jolivet     for (k=0; k<cn; k++) {
1675*c2916339SPierre Jolivet       idx = jj;
1676*c2916339SPierre Jolivet       v = vv;
1677*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1678*c2916339SPierre Jolivet         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1679*c2916339SPierre Jolivet         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1680*c2916339SPierre Jolivet         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1681*c2916339SPierre Jolivet         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1682*c2916339SPierre Jolivet         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1683*c2916339SPierre Jolivet         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1684*c2916339SPierre Jolivet         if (*idx != i) {
1685*c2916339SPierre Jolivet           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1686*c2916339SPierre Jolivet           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1687*c2916339SPierre Jolivet           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1688*c2916339SPierre Jolivet           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1689*c2916339SPierre Jolivet           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1690*c2916339SPierre Jolivet         }
1691*c2916339SPierre Jolivet         v += 25;
1692*c2916339SPierre Jolivet         ++idx;
1693*c2916339SPierre Jolivet       }
1694*c2916339SPierre Jolivet     }
1695*c2916339SPierre Jolivet     z += 5;
1696*c2916339SPierre Jolivet   }
1697*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1698*c2916339SPierre Jolivet }
1699*c2916339SPierre Jolivet 
1700*c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1701*c2916339SPierre Jolivet {
1702*c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1703*c2916339SPierre Jolivet   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1704*c2916339SPierre Jolivet   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
1705*c2916339SPierre Jolivet   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1706*c2916339SPierre Jolivet   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1707*c2916339SPierre Jolivet   PetscBLASInt      bbs,bcn,bbm,bcm;
1708*c2916339SPierre Jolivet   PetscScalar       *z = 0;
1709*c2916339SPierre Jolivet   PetscScalar       *c,*b;
1710*c2916339SPierre Jolivet   const MatScalar   *v;
1711*c2916339SPierre Jolivet   const PetscInt    *idx,*ii;
1712*c2916339SPierre Jolivet   PetscScalar       _DOne=1.0;
1713*c2916339SPierre Jolivet   PetscErrorCode    ierr;
1714*c2916339SPierre Jolivet 
1715*c2916339SPierre Jolivet   PetscFunctionBegin;
1716*c2916339SPierre Jolivet   if (!cm || !cn) PetscFunctionReturn(0);
1717*c2916339SPierre Jolivet   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1718*c2916339SPierre Jolivet   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1719*c2916339SPierre Jolivet   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1720*c2916339SPierre Jolivet   b = bd->v;
1721*c2916339SPierre Jolivet   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1722*c2916339SPierre Jolivet   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1723*c2916339SPierre Jolivet   switch (bs) {
1724*c2916339SPierre Jolivet   case 1:
1725*c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1726*c2916339SPierre Jolivet     break;
1727*c2916339SPierre Jolivet   case 2:
1728*c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1729*c2916339SPierre Jolivet     break;
1730*c2916339SPierre Jolivet   case 3:
1731*c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1732*c2916339SPierre Jolivet     break;
1733*c2916339SPierre Jolivet   case 4:
1734*c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1735*c2916339SPierre Jolivet     break;
1736*c2916339SPierre Jolivet   case 5:
1737*c2916339SPierre Jolivet     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1738*c2916339SPierre Jolivet     break;
1739*c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1740*c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1741*c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1742*c2916339SPierre Jolivet     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1743*c2916339SPierre Jolivet     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1744*c2916339SPierre Jolivet     idx = a->j;
1745*c2916339SPierre Jolivet     v   = a->a;
1746*c2916339SPierre Jolivet     mbs = a->mbs;
1747*c2916339SPierre Jolivet     ii  = a->i;
1748*c2916339SPierre Jolivet     z   = c;
1749*c2916339SPierre Jolivet     for (i=0; i<mbs; i++) {
1750*c2916339SPierre Jolivet       n = ii[1] - ii[0]; ii++;
1751*c2916339SPierre Jolivet       for (j=0; j<n; j++) {
1752*c2916339SPierre Jolivet         if (*idx != i)
1753*c2916339SPierre Jolivet           PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1754*c2916339SPierre Jolivet         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1755*c2916339SPierre Jolivet         v += bs2;
1756*c2916339SPierre Jolivet       }
1757*c2916339SPierre Jolivet       z += bs;
1758*c2916339SPierre Jolivet     }
1759*c2916339SPierre Jolivet   }
1760*c2916339SPierre Jolivet   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1761*c2916339SPierre Jolivet   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1762*c2916339SPierre Jolivet   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1763*c2916339SPierre Jolivet   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1764*c2916339SPierre Jolivet   PetscFunctionReturn(0);
1765*c2916339SPierre Jolivet }
1766