xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 367daffbeca5f1514f664d9be3910d7b6e19736f)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
95847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
96847daeccSHong Zhang         Zero some ops' to avoid invalid usse */
974a2ae208SSatish Balay #undef __FUNCT__
98847daeccSHong Zhang #define __FUNCT__ "MatSeqSBAIJZeroOps_Private"
99847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
10049b5e25fSSatish Balay {
1016849ba73SBarry Smith   PetscErrorCode ierr;
10249b5e25fSSatish Balay 
10349b5e25fSSatish Balay   PetscFunctionBegin;
104847daeccSHong Zhang   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
105847daeccSHong Zhang   Bseq->ops->mult                   = 0;
106847daeccSHong Zhang   Bseq->ops->multadd                = 0;
107847daeccSHong Zhang   Bseq->ops->multtranspose          = 0;
108847daeccSHong Zhang   Bseq->ops->multtransposeadd       = 0;
109847daeccSHong Zhang   Bseq->ops->lufactor               = 0;
110847daeccSHong Zhang   Bseq->ops->choleskyfactor         = 0;
111847daeccSHong Zhang   Bseq->ops->lufactorsymbolic       = 0;
112847daeccSHong Zhang   Bseq->ops->choleskyfactorsymbolic = 0;
113847daeccSHong Zhang   Bseq->ops->getinertia             = 0;
11449b5e25fSSatish Balay   PetscFunctionReturn(0);
11549b5e25fSSatish Balay }
11649b5e25fSSatish Balay 
117e50a8114SHong Zhang /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
118e50a8114SHong Zhang #undef __FUNCT__
119e50a8114SHong Zhang #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
120e50a8114SHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
121e50a8114SHong Zhang {
122e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
123e50a8114SHong Zhang   PetscErrorCode ierr;
124e50a8114SHong Zhang   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
125e50a8114SHong Zhang   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
126e50a8114SHong Zhang   const PetscInt *irow,*icol;
127e50a8114SHong Zhang   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
128e50a8114SHong Zhang   PetscInt       *aj = a->j,*ai = a->i;
129e50a8114SHong Zhang   MatScalar      *mat_a;
130e50a8114SHong Zhang   Mat            C;
131e50a8114SHong Zhang   PetscBool      flag;
132e50a8114SHong Zhang 
133e50a8114SHong Zhang   PetscFunctionBegin;
134e50a8114SHong Zhang 
135e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
136e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
137e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
138e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
139e50a8114SHong Zhang 
140e50a8114SHong Zhang   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
141e50a8114SHong Zhang   ssmap = smap;
142e50a8114SHong Zhang   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
143e50a8114SHong Zhang   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
144e50a8114SHong Zhang   /* determine lens of each row */
145e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
146e50a8114SHong Zhang     kstart  = ai[irow[i]];
147e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
148e50a8114SHong Zhang     lens[i] = 0;
149e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
150e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
151e50a8114SHong Zhang     }
152e50a8114SHong Zhang   }
153e50a8114SHong Zhang   /* Create and fill new matrix */
154e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
155e50a8114SHong Zhang     c = (Mat_SeqSBAIJ*)((*B)->data);
156e50a8114SHong Zhang 
157e50a8114SHong Zhang     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
158e50a8114SHong Zhang     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
159e50a8114SHong Zhang     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
160e50a8114SHong Zhang     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
161e50a8114SHong Zhang     C    = *B;
162e50a8114SHong Zhang   } else {
163e50a8114SHong Zhang     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
164e50a8114SHong Zhang     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
165e50a8114SHong Zhang     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
166*367daffbSBarry Smith     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
167e50a8114SHong Zhang   }
168e50a8114SHong Zhang   c = (Mat_SeqSBAIJ*)(C->data);
169e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
170e50a8114SHong Zhang     row      = irow[i];
171e50a8114SHong Zhang     kstart   = ai[row];
172e50a8114SHong Zhang     kend     = kstart + a->ilen[row];
173e50a8114SHong Zhang     mat_i    = c->i[i];
174e50a8114SHong Zhang     mat_j    = c->j + mat_i;
175e50a8114SHong Zhang     mat_a    = c->a + mat_i*bs2;
176e50a8114SHong Zhang     mat_ilen = c->ilen + i;
177e50a8114SHong Zhang     for (k=kstart; k<kend; k++) {
178e50a8114SHong Zhang       if ((tcol=ssmap[a->j[k]])) {
179e50a8114SHong Zhang         *mat_j++ = tcol - 1;
180e50a8114SHong Zhang         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
181e50a8114SHong Zhang         mat_a   += bs2;
182e50a8114SHong Zhang         (*mat_ilen)++;
183e50a8114SHong Zhang       }
184e50a8114SHong Zhang     }
185e50a8114SHong Zhang   }
186e50a8114SHong Zhang   /* sort */
187e50a8114SHong Zhang   {
188e50a8114SHong Zhang     MatScalar *work;
189e50a8114SHong Zhang 
190e50a8114SHong Zhang     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
191e50a8114SHong Zhang     for (i=0; i<nrows; i++) {
192e50a8114SHong Zhang       PetscInt ilen;
193e50a8114SHong Zhang       mat_i = c->i[i];
194e50a8114SHong Zhang       mat_j = c->j + mat_i;
195e50a8114SHong Zhang       mat_a = c->a + mat_i*bs2;
196e50a8114SHong Zhang       ilen  = c->ilen[i];
197e50a8114SHong Zhang       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
198e50a8114SHong Zhang     }
199e50a8114SHong Zhang     ierr = PetscFree(work);CHKERRQ(ierr);
200e50a8114SHong Zhang   }
201e50a8114SHong Zhang 
202e50a8114SHong Zhang   /* Free work space */
203e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
204e50a8114SHong Zhang   ierr = PetscFree(smap);CHKERRQ(ierr);
205e50a8114SHong Zhang   ierr = PetscFree(lens);CHKERRQ(ierr);
206e50a8114SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
207e50a8114SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
208e50a8114SHong Zhang 
209e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210e50a8114SHong Zhang   *B   = C;
211e50a8114SHong Zhang   PetscFunctionReturn(0);
212e50a8114SHong Zhang }
213e50a8114SHong Zhang 
2144a2ae208SSatish Balay #undef __FUNCT__
2154a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
2164aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
21749b5e25fSSatish Balay {
218e50a8114SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
219e50a8114SHong Zhang   IS             is1,is2;
2206849ba73SBarry Smith   PetscErrorCode ierr;
221e50a8114SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
222e50a8114SHong Zhang   const PetscInt *irow,*icol;
22349b5e25fSSatish Balay 
22449b5e25fSSatish Balay   PetscFunctionBegin;
225e50a8114SHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
226e50a8114SHong Zhang   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
227e50a8114SHong Zhang   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
228e50a8114SHong Zhang   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
229e50a8114SHong Zhang 
230e50a8114SHong Zhang   /* Verify if the indices corespond to each element in a block
231e50a8114SHong Zhang    and form the IS with compressed IS */
232e50a8114SHong Zhang   maxmnbs = PetscMax(a->mbs,a->nbs);
233e50a8114SHong Zhang   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
234e50a8114SHong Zhang   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
235e50a8114SHong Zhang   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
236e50a8114SHong Zhang   for (i=0; i<a->mbs; i++) {
237e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
238e50a8114SHong Zhang   }
239e50a8114SHong Zhang   count = 0;
240e50a8114SHong Zhang   for (i=0; i<nrows; i++) {
241e50a8114SHong Zhang     PetscInt j = irow[i] / bs;
242e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
243e50a8114SHong Zhang   }
244e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
245e50a8114SHong Zhang 
246e50a8114SHong Zhang   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
247e50a8114SHong Zhang   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
248e50a8114SHong Zhang   for (i=0; i<a->nbs; i++) {
249e50a8114SHong Zhang     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
250e50a8114SHong Zhang   }
251e50a8114SHong Zhang   count = 0;
252e50a8114SHong Zhang   for (i=0; i<ncols; i++) {
253e50a8114SHong Zhang     PetscInt j = icol[i] / bs;
254e50a8114SHong Zhang     if ((vary[j]--)==bs) iary[count++] = j;
255e50a8114SHong Zhang   }
256e50a8114SHong Zhang   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
257e50a8114SHong Zhang   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
258e50a8114SHong Zhang   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
259e50a8114SHong Zhang   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
260e50a8114SHong Zhang 
261e50a8114SHong Zhang   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
262e50a8114SHong Zhang   ierr = ISDestroy(&is1);CHKERRQ(ierr);
263e50a8114SHong Zhang   ierr = ISDestroy(&is2);CHKERRQ(ierr);
264847daeccSHong Zhang 
2658f46ffcaSHong Zhang   if (isrow != iscol) {
2668f46ffcaSHong Zhang     PetscBool isequal;
2678f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
268847daeccSHong Zhang     if (!isequal) {
269847daeccSHong Zhang       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
2708f46ffcaSHong Zhang     }
27149b5e25fSSatish Balay   }
27249b5e25fSSatish Balay   PetscFunctionReturn(0);
27349b5e25fSSatish Balay }
27449b5e25fSSatish Balay 
2754a2ae208SSatish Balay #undef __FUNCT__
2764a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
27713f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
27849b5e25fSSatish Balay {
2796849ba73SBarry Smith   PetscErrorCode ierr;
28013f74950SBarry Smith   PetscInt       i;
28149b5e25fSSatish Balay 
28249b5e25fSSatish Balay   PetscFunctionBegin;
283e50a8114SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
284e50a8114SHong Zhang     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
285afebec48SHong Zhang   }
286e50a8114SHong Zhang 
287e50a8114SHong Zhang   for (i=0; i<n; i++) {
288e50a8114SHong Zhang     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
28949b5e25fSSatish Balay   }
29049b5e25fSSatish Balay   PetscFunctionReturn(0);
29149b5e25fSSatish Balay }
29249b5e25fSSatish Balay 
29349b5e25fSSatish Balay /* -------------------------------------------------------*/
29449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
29549b5e25fSSatish Balay /* -------------------------------------------------------*/
29649b5e25fSSatish Balay 
2974a2ae208SSatish Balay #undef __FUNCT__
2984a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
299dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
30049b5e25fSSatish Balay {
30149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
302d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
303d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
304d9ca1df4SBarry Smith   const MatScalar   *v;
3056849ba73SBarry Smith   PetscErrorCode    ierr;
306d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
307d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
30898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
30949b5e25fSSatish Balay 
31049b5e25fSSatish Balay   PetscFunctionBegin;
3112dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
312d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3131ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
31449b5e25fSSatish Balay 
31549b5e25fSSatish Balay   v  = a->a;
31649b5e25fSSatish Balay   xb = x;
31749b5e25fSSatish Balay 
31849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
31949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
32049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
32149b5e25fSSatish Balay     ib          = aj + *ai;
322831a3094SHong Zhang     jmin        = 0;
32398c9bda7SSatish Balay     nonzerorow += (n>0);
3247fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
32549b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
32649b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
327831a3094SHong Zhang       v        += 4; jmin++;
3287fbae186SHong Zhang     }
329444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
330444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
331831a3094SHong Zhang     for (j=jmin; j<n; j++) {
33249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
33349b5e25fSSatish Balay       cval       = ib[j]*2;
33449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
33549b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
33649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
33749b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
33849b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
33949b5e25fSSatish Balay       v        += 4;
34049b5e25fSSatish Balay     }
34149b5e25fSSatish Balay     xb +=2; ai++;
34249b5e25fSSatish Balay   }
34349b5e25fSSatish Balay 
344d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3451ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
346dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
34749b5e25fSSatish Balay   PetscFunctionReturn(0);
34849b5e25fSSatish Balay }
34949b5e25fSSatish Balay 
3504a2ae208SSatish Balay #undef __FUNCT__
3514a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
352dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
35349b5e25fSSatish Balay {
35449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
355d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
356d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
357d9ca1df4SBarry Smith   const MatScalar   *v;
3586849ba73SBarry Smith   PetscErrorCode    ierr;
359d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
360d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
36198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
36249b5e25fSSatish Balay 
36349b5e25fSSatish Balay   PetscFunctionBegin;
3642dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
365d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3661ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
36749b5e25fSSatish Balay 
36849b5e25fSSatish Balay   v  = a->a;
36949b5e25fSSatish Balay   xb = x;
37049b5e25fSSatish Balay 
37149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
37249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
37349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
37449b5e25fSSatish Balay     ib          = aj + *ai;
375831a3094SHong Zhang     jmin        = 0;
37698c9bda7SSatish Balay     nonzerorow += (n>0);
3777fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
37849b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
37949b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
38049b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
381831a3094SHong Zhang       v        += 9; jmin++;
3827fbae186SHong Zhang     }
383444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
384444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
385831a3094SHong Zhang     for (j=jmin; j<n; j++) {
38649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38749b5e25fSSatish Balay       cval       = ib[j]*3;
38849b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
38949b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
39049b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
39149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
39249b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
39349b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
39449b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
39549b5e25fSSatish Balay       v        += 9;
39649b5e25fSSatish Balay     }
39749b5e25fSSatish Balay     xb +=3; ai++;
39849b5e25fSSatish Balay   }
39949b5e25fSSatish Balay 
400d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
402dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
40349b5e25fSSatish Balay   PetscFunctionReturn(0);
40449b5e25fSSatish Balay }
40549b5e25fSSatish Balay 
4064a2ae208SSatish Balay #undef __FUNCT__
4074a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
408dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
40949b5e25fSSatish Balay {
41049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
412d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
413d9ca1df4SBarry Smith   const MatScalar   *v;
4146849ba73SBarry Smith   PetscErrorCode    ierr;
415d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
41798c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
41849b5e25fSSatish Balay 
41949b5e25fSSatish Balay   PetscFunctionBegin;
4202dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
421d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4221ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
42349b5e25fSSatish Balay 
42449b5e25fSSatish Balay   v  = a->a;
42549b5e25fSSatish Balay   xb = x;
42649b5e25fSSatish Balay 
42749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
42849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
42949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
43049b5e25fSSatish Balay     ib          = aj + *ai;
431831a3094SHong Zhang     jmin        = 0;
43298c9bda7SSatish Balay     nonzerorow += (n>0);
4337fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
43449b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
43549b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
43649b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
43749b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
438831a3094SHong Zhang       v        += 16; jmin++;
4397fbae186SHong Zhang     }
440444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
441444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
442831a3094SHong Zhang     for (j=jmin; j<n; j++) {
44349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44449b5e25fSSatish Balay       cval       = ib[j]*4;
44549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
44649b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
44749b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
44849b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
44949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
45049b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
45149b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
45249b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
45349b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
45449b5e25fSSatish Balay       v        += 16;
45549b5e25fSSatish Balay     }
45649b5e25fSSatish Balay     xb +=4; ai++;
45749b5e25fSSatish Balay   }
45849b5e25fSSatish Balay 
459d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4601ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
461dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
46249b5e25fSSatish Balay   PetscFunctionReturn(0);
46349b5e25fSSatish Balay }
46449b5e25fSSatish Balay 
4654a2ae208SSatish Balay #undef __FUNCT__
4664a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
467dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
46849b5e25fSSatish Balay {
46949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
470d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
471d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
472d9ca1df4SBarry Smith   const MatScalar   *v;
4736849ba73SBarry Smith   PetscErrorCode    ierr;
474d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
475d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
47698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
47749b5e25fSSatish Balay 
47849b5e25fSSatish Balay   PetscFunctionBegin;
4792dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
480d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4811ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
48249b5e25fSSatish Balay 
48349b5e25fSSatish Balay   v  = a->a;
48449b5e25fSSatish Balay   xb = x;
48549b5e25fSSatish Balay 
48649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
48749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
48849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
48949b5e25fSSatish Balay     ib          = aj + *ai;
490831a3094SHong Zhang     jmin        = 0;
49198c9bda7SSatish Balay     nonzerorow += (n>0);
4927fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
49349b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
49449b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
49549b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
49649b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
49749b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
498831a3094SHong Zhang       v        += 25; jmin++;
4997fbae186SHong Zhang     }
500444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
501444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502831a3094SHong Zhang     for (j=jmin; j<n; j++) {
50349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50449b5e25fSSatish Balay       cval       = ib[j]*5;
50549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
50649b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
50749b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
50849b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
50949b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
51049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51149b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
51249b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
51349b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
51449b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
51549b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
51649b5e25fSSatish Balay       v        += 25;
51749b5e25fSSatish Balay     }
51849b5e25fSSatish Balay     xb +=5; ai++;
51949b5e25fSSatish Balay   }
52049b5e25fSSatish Balay 
521d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
52449b5e25fSSatish Balay   PetscFunctionReturn(0);
52549b5e25fSSatish Balay }
52649b5e25fSSatish Balay 
52749b5e25fSSatish Balay 
5284a2ae208SSatish Balay #undef __FUNCT__
5294a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
530dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
53149b5e25fSSatish Balay {
53249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
533d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
534d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
535d9ca1df4SBarry Smith   const MatScalar   *v;
5366849ba73SBarry Smith   PetscErrorCode    ierr;
537d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
538d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
53998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
54049b5e25fSSatish Balay 
54149b5e25fSSatish Balay   PetscFunctionBegin;
5422dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
543d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5441ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
54549b5e25fSSatish Balay 
54649b5e25fSSatish Balay   v  = a->a;
54749b5e25fSSatish Balay   xb = x;
54849b5e25fSSatish Balay 
54949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
55049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
55149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
55249b5e25fSSatish Balay     ib          = aj + *ai;
553831a3094SHong Zhang     jmin        = 0;
55498c9bda7SSatish Balay     nonzerorow += (n>0);
5557fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
55649b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
55749b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
55849b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
55949b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
56049b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
56149b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
562831a3094SHong Zhang       v        += 36; jmin++;
5637fbae186SHong Zhang     }
564444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
565444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
566831a3094SHong Zhang     for (j=jmin; j<n; j++) {
56749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
56849b5e25fSSatish Balay       cval       = ib[j]*6;
56949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
57049b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
57149b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
57249b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
57349b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
57449b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
57549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
57649b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
57749b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
57849b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
57949b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
58049b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
58149b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
58249b5e25fSSatish Balay       v        += 36;
58349b5e25fSSatish Balay     }
58449b5e25fSSatish Balay     xb +=6; ai++;
58549b5e25fSSatish Balay   }
58649b5e25fSSatish Balay 
587d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5881ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
589dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
59049b5e25fSSatish Balay   PetscFunctionReturn(0);
59149b5e25fSSatish Balay }
5924a2ae208SSatish Balay #undef __FUNCT__
5934a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
594dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
59549b5e25fSSatish Balay {
59649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
597d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
598d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
599d9ca1df4SBarry Smith   const MatScalar   *v;
6006849ba73SBarry Smith   PetscErrorCode    ierr;
601d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
602d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
60398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
60449b5e25fSSatish Balay 
60549b5e25fSSatish Balay   PetscFunctionBegin;
6062dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
607d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6081ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
60949b5e25fSSatish Balay 
61049b5e25fSSatish Balay   v  = a->a;
61149b5e25fSSatish Balay   xb = x;
61249b5e25fSSatish Balay 
61349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
61449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
61549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
61649b5e25fSSatish Balay     ib          = aj + *ai;
617831a3094SHong Zhang     jmin        = 0;
61898c9bda7SSatish Balay     nonzerorow += (n>0);
6197fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
62049b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
62149b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
62249b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
62349b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
62449b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
62549b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
62649b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
627831a3094SHong Zhang       v        += 49; jmin++;
6287fbae186SHong Zhang     }
629444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
630444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
631831a3094SHong Zhang     for (j=jmin; j<n; j++) {
63249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
63349b5e25fSSatish Balay       cval       = ib[j]*7;
63449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
63549b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
63649b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
63749b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
63849b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
63949b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
64049b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
64149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
64249b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
64349b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
64449b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
64549b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
64649b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
64749b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
64849b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
64949b5e25fSSatish Balay       v       += 49;
65049b5e25fSSatish Balay     }
65149b5e25fSSatish Balay     xb +=7; ai++;
65249b5e25fSSatish Balay   }
653d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6541ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
655dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
65649b5e25fSSatish Balay   PetscFunctionReturn(0);
65749b5e25fSSatish Balay }
65849b5e25fSSatish Balay 
65949b5e25fSSatish Balay /*
66049b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
66149b5e25fSSatish Balay */
6624a2ae208SSatish Balay #undef __FUNCT__
6634a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
664dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
66549b5e25fSSatish Balay {
66649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
667d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
668d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
669d9ca1df4SBarry Smith   const MatScalar   *v;
670dfbe8321SBarry Smith   PetscErrorCode    ierr;
671d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
672d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
67398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
67449b5e25fSSatish Balay 
67549b5e25fSSatish Balay   PetscFunctionBegin;
6762dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
677d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6781ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
67949b5e25fSSatish Balay 
68049b5e25fSSatish Balay   aj = a->j;
68149b5e25fSSatish Balay   v  = a->a;
68249b5e25fSSatish Balay   ii = a->i;
68349b5e25fSSatish Balay 
68449b5e25fSSatish Balay   if (!a->mult_work) {
685854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
68649b5e25fSSatish Balay   }
68749b5e25fSSatish Balay   work = a->mult_work;
68849b5e25fSSatish Balay 
68949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
69049b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
69149b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
69298c9bda7SSatish Balay     nonzerorow += (n>0);
69349b5e25fSSatish Balay 
69449b5e25fSSatish Balay     /* upper triangular part */
69549b5e25fSSatish Balay     for (j=0; j<n; j++) {
69649b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
69749b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
69849b5e25fSSatish Balay       workt += bs;
69949b5e25fSSatish Balay     }
70049b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
70196b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
70249b5e25fSSatish Balay 
70349b5e25fSSatish Balay     /* strict lower triangular part */
704831a3094SHong Zhang     idx = aj+ii[0];
705831a3094SHong Zhang     if (*idx == i) {
70696b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
707831a3094SHong Zhang     }
70896b9376eSHong Zhang 
70949b5e25fSSatish Balay     if (ncols > 0) {
71049b5e25fSSatish Balay       workt = work;
71187828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
71296b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
713831a3094SHong Zhang       for (j=0; j<n; j++) {
714831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
71549b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
71649b5e25fSSatish Balay         workt += bs;
71749b5e25fSSatish Balay       }
71849b5e25fSSatish Balay     }
71949b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
72049b5e25fSSatish Balay   }
72149b5e25fSSatish Balay 
722d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
7231ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
724dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
72549b5e25fSSatish Balay   PetscFunctionReturn(0);
72649b5e25fSSatish Balay }
72749b5e25fSSatish Balay 
7284a2ae208SSatish Balay #undef __FUNCT__
7294a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
730dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
73149b5e25fSSatish Balay {
73249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
733d9ca1df4SBarry Smith   PetscScalar       *z,x1;
734d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
735d9ca1df4SBarry Smith   const MatScalar   *v;
7366849ba73SBarry Smith   PetscErrorCode    ierr;
737d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
738d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
73998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
74049b5e25fSSatish Balay 
74149b5e25fSSatish Balay   PetscFunctionBegin;
742343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
743d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7441ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
74549b5e25fSSatish Balay   v    = a->a;
74649b5e25fSSatish Balay   xb   = x;
74749b5e25fSSatish Balay 
74849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
74949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
75049b5e25fSSatish Balay     x1          = xb[0];
75149b5e25fSSatish Balay     ib          = aj + *ai;
752831a3094SHong Zhang     jmin        = 0;
75398c9bda7SSatish Balay     nonzerorow += (n>0);
754831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
755831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
756831a3094SHong Zhang     }
757831a3094SHong Zhang     for (j=jmin; j<n; j++) {
75849b5e25fSSatish Balay       cval    = *ib;
75949b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
76049b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
76149b5e25fSSatish Balay     }
76249b5e25fSSatish Balay     xb++; ai++;
76349b5e25fSSatish Balay   }
76449b5e25fSSatish Balay 
765d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
766bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76749b5e25fSSatish Balay 
768dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76949b5e25fSSatish Balay   PetscFunctionReturn(0);
77049b5e25fSSatish Balay }
77149b5e25fSSatish Balay 
7724a2ae208SSatish Balay #undef __FUNCT__
7734a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
774dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
77549b5e25fSSatish Balay {
77649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
777d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
778d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
779d9ca1df4SBarry Smith   const MatScalar   *v;
7806849ba73SBarry Smith   PetscErrorCode    ierr;
781d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
782d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
78398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
78449b5e25fSSatish Balay 
78549b5e25fSSatish Balay   PetscFunctionBegin;
786343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
787d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7881ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78949b5e25fSSatish Balay 
79049b5e25fSSatish Balay   v  = a->a;
79149b5e25fSSatish Balay   xb = x;
79249b5e25fSSatish Balay 
79349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
79449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
79549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
79649b5e25fSSatish Balay     ib          = aj + *ai;
797831a3094SHong Zhang     jmin        = 0;
79898c9bda7SSatish Balay     nonzerorow += (n>0);
7997fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
80049b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
80149b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
802831a3094SHong Zhang       v        += 4; jmin++;
8037fbae186SHong Zhang     }
804444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
805444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
806831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80849b5e25fSSatish Balay       cval       = ib[j]*2;
80949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
81049b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
81149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
81249b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
81349b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
81449b5e25fSSatish Balay       v        += 4;
81549b5e25fSSatish Balay     }
81649b5e25fSSatish Balay     xb +=2; ai++;
81749b5e25fSSatish Balay   }
818d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
819bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
82049b5e25fSSatish Balay 
821dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
82249b5e25fSSatish Balay   PetscFunctionReturn(0);
82349b5e25fSSatish Balay }
82449b5e25fSSatish Balay 
8254a2ae208SSatish Balay #undef __FUNCT__
8264a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
827dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
82849b5e25fSSatish Balay {
82949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
830d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
831d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
832d9ca1df4SBarry Smith   const MatScalar   *v;
8336849ba73SBarry Smith   PetscErrorCode    ierr;
834d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
835d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83749b5e25fSSatish Balay 
83849b5e25fSSatish Balay   PetscFunctionBegin;
839343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
840d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8411ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
84249b5e25fSSatish Balay 
84349b5e25fSSatish Balay   v  = a->a;
84449b5e25fSSatish Balay   xb = x;
84549b5e25fSSatish Balay 
84649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
84949b5e25fSSatish Balay     ib          = aj + *ai;
850831a3094SHong Zhang     jmin        = 0;
85198c9bda7SSatish Balay     nonzerorow += (n>0);
8527fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
85349b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
85449b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
85549b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
856831a3094SHong Zhang       v        += 9; jmin++;
8577fbae186SHong Zhang     }
858444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
860831a3094SHong Zhang     for (j=jmin; j<n; j++) {
86149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
86249b5e25fSSatish Balay       cval       = ib[j]*3;
86349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
86449b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
86549b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
86649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86749b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
86849b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
86949b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
87049b5e25fSSatish Balay       v        += 9;
87149b5e25fSSatish Balay     }
87249b5e25fSSatish Balay     xb +=3; ai++;
87349b5e25fSSatish Balay   }
87449b5e25fSSatish Balay 
875d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
876bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87749b5e25fSSatish Balay 
878dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
87949b5e25fSSatish Balay   PetscFunctionReturn(0);
88049b5e25fSSatish Balay }
88149b5e25fSSatish Balay 
8824a2ae208SSatish Balay #undef __FUNCT__
8834a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
884dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
88549b5e25fSSatish Balay {
88649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
887d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
888d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
889d9ca1df4SBarry Smith   const MatScalar   *v;
8906849ba73SBarry Smith   PetscErrorCode    ierr;
891d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
892d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
89398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
89449b5e25fSSatish Balay 
89549b5e25fSSatish Balay   PetscFunctionBegin;
896343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
897d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8981ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
89949b5e25fSSatish Balay 
90049b5e25fSSatish Balay   v  = a->a;
90149b5e25fSSatish Balay   xb = x;
90249b5e25fSSatish Balay 
90349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
90649b5e25fSSatish Balay     ib          = aj + *ai;
907831a3094SHong Zhang     jmin        = 0;
90898c9bda7SSatish Balay     nonzerorow += (n>0);
9097fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
91049b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
91149b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
91249b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
91349b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
914831a3094SHong Zhang       v        += 16; jmin++;
9157fbae186SHong Zhang     }
916444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
917444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
918831a3094SHong Zhang     for (j=jmin; j<n; j++) {
91949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
92049b5e25fSSatish Balay       cval       = ib[j]*4;
92149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
92249b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
92349b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
92449b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
92549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92649b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
92749b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
92849b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
92949b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
93049b5e25fSSatish Balay       v        += 16;
93149b5e25fSSatish Balay     }
93249b5e25fSSatish Balay     xb +=4; ai++;
93349b5e25fSSatish Balay   }
93449b5e25fSSatish Balay 
935d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
936bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
93749b5e25fSSatish Balay 
938dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
93949b5e25fSSatish Balay   PetscFunctionReturn(0);
94049b5e25fSSatish Balay }
94149b5e25fSSatish Balay 
9424a2ae208SSatish Balay #undef __FUNCT__
9434a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
944dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
94549b5e25fSSatish Balay {
94649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
947d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
948d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
949d9ca1df4SBarry Smith   const MatScalar   *v;
9506849ba73SBarry Smith   PetscErrorCode    ierr;
951d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
952d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
95398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
95449b5e25fSSatish Balay 
95549b5e25fSSatish Balay   PetscFunctionBegin;
956343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
957d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9581ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
95949b5e25fSSatish Balay 
96049b5e25fSSatish Balay   v  = a->a;
96149b5e25fSSatish Balay   xb = x;
96249b5e25fSSatish Balay 
96349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
96449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
96649b5e25fSSatish Balay     ib          = aj + *ai;
967831a3094SHong Zhang     jmin        = 0;
96898c9bda7SSatish Balay     nonzerorow += (n>0);
9697fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
97049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
97149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
97249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
97349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
97449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
975831a3094SHong Zhang       v        += 25; jmin++;
9767fbae186SHong Zhang     }
977444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979831a3094SHong Zhang     for (j=jmin; j<n; j++) {
98049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
98149b5e25fSSatish Balay       cval       = ib[j]*5;
98249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
98349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
98449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
98549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
98649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
98749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
98849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
98949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
99049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
99149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
99249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
99349b5e25fSSatish Balay       v        += 25;
99449b5e25fSSatish Balay     }
99549b5e25fSSatish Balay     xb +=5; ai++;
99649b5e25fSSatish Balay   }
99749b5e25fSSatish Balay 
998d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
999bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
100049b5e25fSSatish Balay 
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
100249b5e25fSSatish Balay   PetscFunctionReturn(0);
100349b5e25fSSatish Balay }
10044a2ae208SSatish Balay #undef __FUNCT__
10054a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
1006dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
100749b5e25fSSatish Balay {
100849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1009d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1010d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1011d9ca1df4SBarry Smith   const MatScalar   *v;
10126849ba73SBarry Smith   PetscErrorCode    ierr;
1013d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1014d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
101598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
101649b5e25fSSatish Balay 
101749b5e25fSSatish Balay   PetscFunctionBegin;
1018343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1019d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10201ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
102149b5e25fSSatish Balay 
102249b5e25fSSatish Balay   v  = a->a;
102349b5e25fSSatish Balay   xb = x;
102449b5e25fSSatish Balay 
102549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
102649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
102749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
102849b5e25fSSatish Balay     ib          = aj + *ai;
1029831a3094SHong Zhang     jmin        = 0;
103098c9bda7SSatish Balay     nonzerorow += (n>0);
10317fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
103249b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
103349b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
103449b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
103549b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
103649b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
103749b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1038831a3094SHong Zhang       v        += 36; jmin++;
10397fbae186SHong Zhang     }
1040444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1041444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1042831a3094SHong Zhang     for (j=jmin; j<n; j++) {
104349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
104449b5e25fSSatish Balay       cval       = ib[j]*6;
104549b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
104649b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
104749b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
104849b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
104949b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
105049b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
105149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
105249b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
105349b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
105449b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
105549b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
105649b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
105749b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
105849b5e25fSSatish Balay       v        += 36;
105949b5e25fSSatish Balay     }
106049b5e25fSSatish Balay     xb +=6; ai++;
106149b5e25fSSatish Balay   }
106249b5e25fSSatish Balay 
1063d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1064bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
106549b5e25fSSatish Balay 
1066dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
106749b5e25fSSatish Balay   PetscFunctionReturn(0);
106849b5e25fSSatish Balay }
106949b5e25fSSatish Balay 
10704a2ae208SSatish Balay #undef __FUNCT__
10714a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1072dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
107349b5e25fSSatish Balay {
107449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1075d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1076d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1077d9ca1df4SBarry Smith   const MatScalar   *v;
10786849ba73SBarry Smith   PetscErrorCode    ierr;
1079d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1080d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
108198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
108249b5e25fSSatish Balay 
108349b5e25fSSatish Balay   PetscFunctionBegin;
1084343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1085d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
108749b5e25fSSatish Balay 
108849b5e25fSSatish Balay   v  = a->a;
108949b5e25fSSatish Balay   xb = x;
109049b5e25fSSatish Balay 
109149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
109249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
109349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
109449b5e25fSSatish Balay     ib          = aj + *ai;
1095831a3094SHong Zhang     jmin        = 0;
109698c9bda7SSatish Balay     nonzerorow += (n>0);
10977fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
109849b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
109949b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
110049b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
110149b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
110249b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
110349b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
110449b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1105831a3094SHong Zhang       v        += 49; jmin++;
11067fbae186SHong Zhang     }
1107444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1108444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1109831a3094SHong Zhang     for (j=jmin; j<n; j++) {
111049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
111149b5e25fSSatish Balay       cval       = ib[j]*7;
111249b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
111349b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
111449b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
111549b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
111649b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
111749b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
111849b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
111949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
112049b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
112149b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
112249b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
112349b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
112449b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
112549b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
112649b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
112749b5e25fSSatish Balay       v       += 49;
112849b5e25fSSatish Balay     }
112949b5e25fSSatish Balay     xb +=7; ai++;
113049b5e25fSSatish Balay   }
113149b5e25fSSatish Balay 
1132d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1133bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
113449b5e25fSSatish Balay 
1135dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
113649b5e25fSSatish Balay   PetscFunctionReturn(0);
113749b5e25fSSatish Balay }
113849b5e25fSSatish Balay 
11394a2ae208SSatish Balay #undef __FUNCT__
11404a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1141dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
114249b5e25fSSatish Balay {
114349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1144d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1145d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1146d9ca1df4SBarry Smith   const MatScalar   *v;
1147dfbe8321SBarry Smith   PetscErrorCode    ierr;
1148d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1149d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
115098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
115149b5e25fSSatish Balay 
115249b5e25fSSatish Balay   PetscFunctionBegin;
1153343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1154d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11551ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
115649b5e25fSSatish Balay 
115749b5e25fSSatish Balay   aj = a->j;
115849b5e25fSSatish Balay   v  = a->a;
115949b5e25fSSatish Balay   ii = a->i;
116049b5e25fSSatish Balay 
116149b5e25fSSatish Balay   if (!a->mult_work) {
1162854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
116349b5e25fSSatish Balay   }
116449b5e25fSSatish Balay   work = a->mult_work;
116549b5e25fSSatish Balay 
116649b5e25fSSatish Balay 
116749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
116849b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
116949b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
117098c9bda7SSatish Balay     nonzerorow += (n>0);
117149b5e25fSSatish Balay 
117249b5e25fSSatish Balay     /* upper triangular part */
117349b5e25fSSatish Balay     for (j=0; j<n; j++) {
117449b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
117549b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
117649b5e25fSSatish Balay       workt += bs;
117749b5e25fSSatish Balay     }
117849b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
117996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
118049b5e25fSSatish Balay 
118149b5e25fSSatish Balay     /* strict lower triangular part */
1182831a3094SHong Zhang     idx = aj+ii[0];
1183831a3094SHong Zhang     if (*idx == i) {
118496b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1185831a3094SHong Zhang     }
118649b5e25fSSatish Balay     if (ncols > 0) {
118749b5e25fSSatish Balay       workt = work;
118887828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
118996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1190831a3094SHong Zhang       for (j=0; j<n; j++) {
1191831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
119249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
119349b5e25fSSatish Balay         workt += bs;
119449b5e25fSSatish Balay       }
119549b5e25fSSatish Balay     }
119649b5e25fSSatish Balay 
119749b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
119849b5e25fSSatish Balay   }
119949b5e25fSSatish Balay 
1200d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1201bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
120249b5e25fSSatish Balay 
1203dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
120449b5e25fSSatish Balay   PetscFunctionReturn(0);
120549b5e25fSSatish Balay }
120649b5e25fSSatish Balay 
12074a2ae208SSatish Balay #undef __FUNCT__
12084a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1209f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
121049b5e25fSSatish Balay {
121149b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1212f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1213efee365bSSatish Balay   PetscErrorCode ierr;
1214c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
121549b5e25fSSatish Balay 
121649b5e25fSSatish Balay   PetscFunctionBegin;
1217c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
12188b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1219efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
122049b5e25fSSatish Balay   PetscFunctionReturn(0);
122149b5e25fSSatish Balay }
122249b5e25fSSatish Balay 
12234a2ae208SSatish Balay #undef __FUNCT__
12244a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1225dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
122649b5e25fSSatish Balay {
122749b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1228d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
122949b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1230d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
12316849ba73SBarry Smith   PetscErrorCode  ierr;
1232d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
123349b5e25fSSatish Balay 
123449b5e25fSSatish Balay   PetscFunctionBegin;
123549b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
123649b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
123749b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1238831a3094SHong Zhang       col  = aj + jmin;
1239831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
124049b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
124149b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
124249b5e25fSSatish Balay         }
1243831a3094SHong Zhang         jmin++;
1244831a3094SHong Zhang       }
1245831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
124649b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
124749b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
124849b5e25fSSatish Balay         }
124949b5e25fSSatish Balay       }
125049b5e25fSSatish Balay     }
12518f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
125251f70360SJed Brown     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
12530b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1254dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12550b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12560b8dc8d2SHong Zhang     il[0] = 0;
125749b5e25fSSatish Balay 
125849b5e25fSSatish Balay     *norm = 0.0;
125949b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
126049b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
126149b5e25fSSatish Balay       /*-- col sum --*/
126249b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1263831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1264831a3094SHong Zhang                   at step k */
126549b5e25fSSatish Balay       while (i<mbs) {
126649b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
126749b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
126849b5e25fSSatish Balay         for (j=0; j<bs; j++) {
126949b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
127049b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
127149b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
127249b5e25fSSatish Balay           }
127349b5e25fSSatish Balay         }
127449b5e25fSSatish Balay         /* update il, jl */
1275831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1276831a3094SHong Zhang         jmax = a->i[i+1];
127749b5e25fSSatish Balay         if (jmin < jmax) {
127849b5e25fSSatish Balay           il[i] = jmin;
127949b5e25fSSatish Balay           j     = a->j[jmin];
128049b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
128149b5e25fSSatish Balay         }
128249b5e25fSSatish Balay         i = nexti;
128349b5e25fSSatish Balay       }
128449b5e25fSSatish Balay       /*-- row sum --*/
128549b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
128649b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
128749b5e25fSSatish Balay         for (j=0; j<bs; j++) {
128849b5e25fSSatish Balay           v = a->a + i*bs2 + j;
128949b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12900b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
129149b5e25fSSatish Balay           }
129249b5e25fSSatish Balay         }
129349b5e25fSSatish Balay       }
129449b5e25fSSatish Balay       /* add k_th block row to il, jl */
1295831a3094SHong Zhang       col = aj+jmin;
1296831a3094SHong Zhang       if (*col == k) jmin++;
129749b5e25fSSatish Balay       if (jmin < jmax) {
129849b5e25fSSatish Balay         il[k] = jmin;
12990b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
130049b5e25fSSatish Balay       }
130149b5e25fSSatish Balay       for (j=0; j<bs; j++) {
130249b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
130349b5e25fSSatish Balay       }
130449b5e25fSSatish Balay     }
130574ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
130651f70360SJed Brown     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1307f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
130849b5e25fSSatish Balay   PetscFunctionReturn(0);
130949b5e25fSSatish Balay }
131049b5e25fSSatish Balay 
13114a2ae208SSatish Balay #undef __FUNCT__
13124a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1313ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
131449b5e25fSSatish Balay {
131549b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1316dfbe8321SBarry Smith   PetscErrorCode ierr;
131749b5e25fSSatish Balay 
131849b5e25fSSatish Balay   PetscFunctionBegin;
131949b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1320d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1321ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1322ef511fbeSHong Zhang     PetscFunctionReturn(0);
132349b5e25fSSatish Balay   }
132449b5e25fSSatish Balay 
132549b5e25fSSatish Balay   /* if the a->i are the same */
132613f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
132726fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
132849b5e25fSSatish Balay 
132949b5e25fSSatish Balay   /* if a->j are the same */
133013f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
133126fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
133226fbe8dcSKarl Rupp 
133349b5e25fSSatish Balay   /* if a->a are the same */
1334d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1335935af2e7SHong Zhang   PetscFunctionReturn(0);
133649b5e25fSSatish Balay }
133749b5e25fSSatish Balay 
13384a2ae208SSatish Balay #undef __FUNCT__
13394a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1340dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
134149b5e25fSSatish Balay {
134249b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1343dfbe8321SBarry Smith   PetscErrorCode  ierr;
1344d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1345d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
134687828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1347d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
134849b5e25fSSatish Balay 
134949b5e25fSSatish Balay   PetscFunctionBegin;
1350d0f46423SBarry Smith   bs = A->rmap->bs;
1351e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
135282799104SHong Zhang 
135349b5e25fSSatish Balay   aa   = a->a;
13548a0c6efdSHong Zhang   ambs = a->mbs;
13558a0c6efdSHong Zhang 
13568a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13578a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13588a0c6efdSHong Zhang     aa   = a->a;
13598a0c6efdSHong Zhang     ambs = a->mbs;
13608a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13618a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13628a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13638a0c6efdSHong Zhang     PetscFunctionReturn(0);
13648a0c6efdSHong Zhang   }
13658a0c6efdSHong Zhang 
136649b5e25fSSatish Balay   ai   = a->i;
136749b5e25fSSatish Balay   aj   = a->j;
136849b5e25fSSatish Balay   bs2  = a->bs2;
13692dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13701ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
137149b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
137249b5e25fSSatish Balay     j=ai[i];
137349b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
137449b5e25fSSatish Balay       row  = i*bs;
137549b5e25fSSatish Balay       aa_j = aa + j*bs2;
137649b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
137749b5e25fSSatish Balay     }
137849b5e25fSSatish Balay   }
13791ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
138049b5e25fSSatish Balay   PetscFunctionReturn(0);
138149b5e25fSSatish Balay }
138249b5e25fSSatish Balay 
13834a2ae208SSatish Balay #undef __FUNCT__
13844a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1385dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
138649b5e25fSSatish Balay {
138749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1388d9ca1df4SBarry Smith   PetscScalar       x;
1389d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
139049b5e25fSSatish Balay   MatScalar         *aa,*v;
1391dfbe8321SBarry Smith   PetscErrorCode    ierr;
13925e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1393ace3abfcSBarry Smith   PetscBool         flg;
139449b5e25fSSatish Balay 
139549b5e25fSSatish Balay   PetscFunctionBegin;
1396b3bf805bSHong Zhang   if (ll != rr) {
1397b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1398e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1399b3bf805bSHong Zhang   }
1400b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
140149b5e25fSSatish Balay   ai  = a->i;
140249b5e25fSSatish Balay   aj  = a->j;
140349b5e25fSSatish Balay   aa  = a->a;
1404d0f46423SBarry Smith   m   = A->rmap->N;
1405d0f46423SBarry Smith   bs  = A->rmap->bs;
140649b5e25fSSatish Balay   mbs = a->mbs;
140749b5e25fSSatish Balay   bs2 = a->bs2;
140849b5e25fSSatish Balay 
1409d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
141049b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1411e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
141249b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
141349b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
141449b5e25fSSatish Balay     li = l + i*bs;
141549b5e25fSSatish Balay     v  = aa + bs2*ai[i];
141649b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
141749b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
14185e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
141949b5e25fSSatish Balay         x = ri[k];
142049b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
142149b5e25fSSatish Balay       }
142249b5e25fSSatish Balay     }
142349b5e25fSSatish Balay   }
1424d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1425dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
142649b5e25fSSatish Balay   PetscFunctionReturn(0);
142749b5e25fSSatish Balay }
142849b5e25fSSatish Balay 
14294a2ae208SSatish Balay #undef __FUNCT__
14304a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1431dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
143249b5e25fSSatish Balay {
143349b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
143449b5e25fSSatish Balay 
143549b5e25fSSatish Balay   PetscFunctionBegin;
143649b5e25fSSatish Balay   info->block_size   = a->bs2;
1437ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
14386c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
143949b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
144049b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14418e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14427adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1443d5f3da31SBarry Smith   if (A->factortype) {
144449b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
144549b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
144649b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
144749b5e25fSSatish Balay   } else {
144849b5e25fSSatish Balay     info->fill_ratio_given  = 0;
144949b5e25fSSatish Balay     info->fill_ratio_needed = 0;
145049b5e25fSSatish Balay     info->factor_mallocs    = 0;
145149b5e25fSSatish Balay   }
145249b5e25fSSatish Balay   PetscFunctionReturn(0);
145349b5e25fSSatish Balay }
145449b5e25fSSatish Balay 
145549b5e25fSSatish Balay 
14564a2ae208SSatish Balay #undef __FUNCT__
14574a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1458dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
145949b5e25fSSatish Balay {
146049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1461dfbe8321SBarry Smith   PetscErrorCode ierr;
146249b5e25fSSatish Balay 
146349b5e25fSSatish Balay   PetscFunctionBegin;
146449b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
146549b5e25fSSatish Balay   PetscFunctionReturn(0);
146649b5e25fSSatish Balay }
1467dc354874SHong Zhang 
14684a2ae208SSatish Balay #undef __FUNCT__
1469985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1470985db425SBarry Smith /*
1471985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1472985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1473985db425SBarry Smith */
1474985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1475dc354874SHong Zhang {
1476dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1477dfbe8321SBarry Smith   PetscErrorCode  ierr;
1478d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1479d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1480c3fca9a7SHong Zhang   PetscReal       atmp;
1481d9ca1df4SBarry Smith   const MatScalar *aa;
1482985db425SBarry Smith   PetscScalar     *x;
148313f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1484dc354874SHong Zhang 
1485dc354874SHong Zhang   PetscFunctionBegin;
1486e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1487e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1488d0f46423SBarry Smith   bs  = A->rmap->bs;
1489dc354874SHong Zhang   aa  = a->a;
1490dc354874SHong Zhang   ai  = a->i;
1491dc354874SHong Zhang   aj  = a->j;
149244117c81SHong Zhang   mbs = a->mbs;
1493dc354874SHong Zhang 
1494985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14951ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1496dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1497e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
149844117c81SHong Zhang   for (i=0; i<mbs; i++) {
1499d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1500d0f6400bSHong Zhang     brow  = bs*i;
150144117c81SHong Zhang     for (j=0; j<ncols; j++) {
1502d0f6400bSHong Zhang       bcol = bs*(*aj);
150344117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1504d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
150544117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1506d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1507d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1508c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1509c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
151044117c81SHong Zhang         }
151144117c81SHong Zhang       }
1512d0f6400bSHong Zhang       aj++;
1513dc354874SHong Zhang     }
1514dc354874SHong Zhang   }
15151ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1516dc354874SHong Zhang   PetscFunctionReturn(0);
1517dc354874SHong Zhang }
1518