xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision afebec48acb8b1040c5bfa54d0857ffcc0c220e6)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
954a2ae208SSatish Balay #undef __FUNCT__
964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
9755d14e7dSHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
9849b5e25fSSatish Balay {
9949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
1006849ba73SBarry Smith   PetscErrorCode  ierr;
10113f74950SBarry Smith   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
10213f74950SBarry Smith   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
1035d0c19d7SBarry Smith   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
1045d0c19d7SBarry Smith   const PetscInt  *irow,*aj = a->j,*ai = a->i;
10549b5e25fSSatish Balay   MatScalar       *mat_a;
10649b5e25fSSatish Balay   Mat             C;
107ace3abfcSBarry Smith   PetscBool       flag,sorted;
10849b5e25fSSatish Balay 
10949b5e25fSSatish Balay   PetscFunctionBegin;
11055d14e7dSHong Zhang   ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr);
11155d14e7dSHong Zhang   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
11249b5e25fSSatish Balay 
11349b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
11449b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
11549b5e25fSSatish Balay 
116785e854fSJed Brown   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
11774ed9c26SBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
11849b5e25fSSatish Balay   ssmap = smap;
119854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
12049b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
12149b5e25fSSatish Balay   /* determine lens of each row */
12249b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
12349b5e25fSSatish Balay     kstart  = ai[irow[i]];
12449b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
12549b5e25fSSatish Balay     lens[i] = 0;
12649b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
12726fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
12849b5e25fSSatish Balay     }
12949b5e25fSSatish Balay   }
13049b5e25fSSatish Balay   /* Create and fill new matrix */
13149b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
13249b5e25fSSatish Balay     c = (Mat_SeqSBAIJ*)((*B)->data);
13349b5e25fSSatish Balay 
134e32f2f54SBarry Smith     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
13513f74950SBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
13713f74950SBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
13849b5e25fSSatish Balay     C    = *B;
13949b5e25fSSatish Balay   } else {
140ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
141f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1427adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
143ab93d7beSBarry Smith     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
14449b5e25fSSatish Balay   }
14549b5e25fSSatish Balay   c = (Mat_SeqSBAIJ*)(C->data);
14649b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
14749b5e25fSSatish Balay     row      = irow[i];
14849b5e25fSSatish Balay     kstart   = ai[row];
14949b5e25fSSatish Balay     kend     = kstart + a->ilen[row];
15049b5e25fSSatish Balay     mat_i    = c->i[i];
15149b5e25fSSatish Balay     mat_j    = c->j + mat_i;
15249b5e25fSSatish Balay     mat_a    = c->a + mat_i*bs2;
15349b5e25fSSatish Balay     mat_ilen = c->ilen + i;
15449b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
15549b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
15649b5e25fSSatish Balay         *mat_j++ = tcol - 1;
15749b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
15849b5e25fSSatish Balay         mat_a   += bs2;
15949b5e25fSSatish Balay         (*mat_ilen)++;
16049b5e25fSSatish Balay       }
16149b5e25fSSatish Balay     }
16249b5e25fSSatish Balay   }
16349b5e25fSSatish Balay 
16449b5e25fSSatish Balay   /* Free work space */
16549b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
16649b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
16749b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16849b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16949b5e25fSSatish Balay 
17049b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
17149b5e25fSSatish Balay   *B   = C;
17249b5e25fSSatish Balay   PetscFunctionReturn(0);
17349b5e25fSSatish Balay }
17449b5e25fSSatish Balay 
1754a2ae208SSatish Balay #undef __FUNCT__
1764a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
1774aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
17849b5e25fSSatish Balay {
17949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
18049b5e25fSSatish Balay   IS             is1;
1816849ba73SBarry Smith   PetscErrorCode ierr;
1825d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
1835d0c19d7SBarry Smith   const PetscInt *irow;
18449b5e25fSSatish Balay 
18549b5e25fSSatish Balay   PetscFunctionBegin;
1868f46ffcaSHong Zhang   if (isrow != iscol) {
1878f46ffcaSHong Zhang     PetscBool isequal;
1888f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
1896c4ed002SBarry Smith     if (!isequal) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
1908f46ffcaSHong Zhang   }
19149b5e25fSSatish Balay 
19255d14e7dSHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
19349b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
19449b5e25fSSatish Balay 
19549b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
19655d14e7dSHong Zhang    and form the IS with compressed IS */
197dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
19874ed9c26SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
19949b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
20049b5e25fSSatish Balay 
20149b5e25fSSatish Balay   count = 0;
20249b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
203e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
20449b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
20549b5e25fSSatish Balay   }
20649b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
20770b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
20874ed9c26SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
20949b5e25fSSatish Balay 
21055d14e7dSHong Zhang   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr);
2116bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
21249b5e25fSSatish Balay   PetscFunctionReturn(0);
21349b5e25fSSatish Balay }
21449b5e25fSSatish Balay 
2154a2ae208SSatish Balay #undef __FUNCT__
2164a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
21713f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
21849b5e25fSSatish Balay {
2196849ba73SBarry Smith   PetscErrorCode ierr;
22013f74950SBarry Smith   PetscInt       i;
221*afebec48SHong Zhang   PetscBool      flg;
22249b5e25fSSatish Balay 
22349b5e25fSSatish Balay   PetscFunctionBegin;
224*afebec48SHong Zhang   ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr);
22549b5e25fSSatish Balay   for (i=0; i<n; i++) {
226*afebec48SHong Zhang     ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr);
227*afebec48SHong Zhang     if (!flg) {
228*afebec48SHong Zhang       Mat Bseq = *B[i];
229*afebec48SHong Zhang       /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
230*afebec48SHong Zhang          Zero some ops' to avoid invalid usse */
231*afebec48SHong Zhang       ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
232*afebec48SHong Zhang       ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
233*afebec48SHong Zhang       Bseq->ops->mult                   = 0;
234*afebec48SHong Zhang       Bseq->ops->multadd                = 0;
235*afebec48SHong Zhang       Bseq->ops->multtranspose          = 0;
236*afebec48SHong Zhang       Bseq->ops->multtransposeadd       = 0;
237*afebec48SHong Zhang       Bseq->ops->lufactor               = 0;
238*afebec48SHong Zhang       Bseq->ops->choleskyfactor         = 0;
239*afebec48SHong Zhang       Bseq->ops->lufactorsymbolic       = 0;
240*afebec48SHong Zhang       Bseq->ops->choleskyfactorsymbolic = 0;
241*afebec48SHong Zhang       Bseq->ops->getinertia             = 0;
242*afebec48SHong Zhang     }
24349b5e25fSSatish Balay   }
24449b5e25fSSatish Balay   PetscFunctionReturn(0);
24549b5e25fSSatish Balay }
24649b5e25fSSatish Balay 
24749b5e25fSSatish Balay /* -------------------------------------------------------*/
24849b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
24949b5e25fSSatish Balay /* -------------------------------------------------------*/
25049b5e25fSSatish Balay 
2514a2ae208SSatish Balay #undef __FUNCT__
2524a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
253dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
25449b5e25fSSatish Balay {
25549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
256d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
257d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
258d9ca1df4SBarry Smith   const MatScalar   *v;
2596849ba73SBarry Smith   PetscErrorCode    ierr;
260d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
261d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
26298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
26349b5e25fSSatish Balay 
26449b5e25fSSatish Balay   PetscFunctionBegin;
2652dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
266d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2671ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
26849b5e25fSSatish Balay 
26949b5e25fSSatish Balay   v  = a->a;
27049b5e25fSSatish Balay   xb = x;
27149b5e25fSSatish Balay 
27249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
27349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
27449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
27549b5e25fSSatish Balay     ib          = aj + *ai;
276831a3094SHong Zhang     jmin        = 0;
27798c9bda7SSatish Balay     nonzerorow += (n>0);
2787fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
27949b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
28049b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
281831a3094SHong Zhang       v        += 4; jmin++;
2827fbae186SHong Zhang     }
283444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
284444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
285831a3094SHong Zhang     for (j=jmin; j<n; j++) {
28649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
28749b5e25fSSatish Balay       cval       = ib[j]*2;
28849b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
28949b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
29049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
29149b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
29249b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
29349b5e25fSSatish Balay       v        += 4;
29449b5e25fSSatish Balay     }
29549b5e25fSSatish Balay     xb +=2; ai++;
29649b5e25fSSatish Balay   }
29749b5e25fSSatish Balay 
298d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2991ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
300dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
30149b5e25fSSatish Balay   PetscFunctionReturn(0);
30249b5e25fSSatish Balay }
30349b5e25fSSatish Balay 
3044a2ae208SSatish Balay #undef __FUNCT__
3054a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
306dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
30749b5e25fSSatish Balay {
30849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
309d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
310d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
311d9ca1df4SBarry Smith   const MatScalar   *v;
3126849ba73SBarry Smith   PetscErrorCode    ierr;
313d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
314d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
31598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
31649b5e25fSSatish Balay 
31749b5e25fSSatish Balay   PetscFunctionBegin;
3182dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
319d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3201ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
32149b5e25fSSatish Balay 
32249b5e25fSSatish Balay   v  = a->a;
32349b5e25fSSatish Balay   xb = x;
32449b5e25fSSatish Balay 
32549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
32649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
32749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
32849b5e25fSSatish Balay     ib          = aj + *ai;
329831a3094SHong Zhang     jmin        = 0;
33098c9bda7SSatish Balay     nonzerorow += (n>0);
3317fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
33249b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
33349b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
33449b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
335831a3094SHong Zhang       v        += 9; jmin++;
3367fbae186SHong Zhang     }
337444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
338444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
339831a3094SHong Zhang     for (j=jmin; j<n; j++) {
34049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
34149b5e25fSSatish Balay       cval       = ib[j]*3;
34249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
34349b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
34449b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
34549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
34649b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
34749b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
34849b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
34949b5e25fSSatish Balay       v        += 9;
35049b5e25fSSatish Balay     }
35149b5e25fSSatish Balay     xb +=3; ai++;
35249b5e25fSSatish Balay   }
35349b5e25fSSatish Balay 
354d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3551ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
356dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
35749b5e25fSSatish Balay   PetscFunctionReturn(0);
35849b5e25fSSatish Balay }
35949b5e25fSSatish Balay 
3604a2ae208SSatish Balay #undef __FUNCT__
3614a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
362dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
36349b5e25fSSatish Balay {
36449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
365d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
366d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
367d9ca1df4SBarry Smith   const MatScalar   *v;
3686849ba73SBarry Smith   PetscErrorCode    ierr;
369d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
370d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
37198c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
37249b5e25fSSatish Balay 
37349b5e25fSSatish Balay   PetscFunctionBegin;
3742dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
375d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3761ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
37749b5e25fSSatish Balay 
37849b5e25fSSatish Balay   v  = a->a;
37949b5e25fSSatish Balay   xb = x;
38049b5e25fSSatish Balay 
38149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
38249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
38349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
38449b5e25fSSatish Balay     ib          = aj + *ai;
385831a3094SHong Zhang     jmin        = 0;
38698c9bda7SSatish Balay     nonzerorow += (n>0);
3877fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
38849b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
38949b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
39049b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
39149b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
392831a3094SHong Zhang       v        += 16; jmin++;
3937fbae186SHong Zhang     }
394444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
395444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
396831a3094SHong Zhang     for (j=jmin; j<n; j++) {
39749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
39849b5e25fSSatish Balay       cval       = ib[j]*4;
39949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
40049b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
40149b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
40249b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
40349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
40449b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
40549b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
40649b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
40749b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
40849b5e25fSSatish Balay       v        += 16;
40949b5e25fSSatish Balay     }
41049b5e25fSSatish Balay     xb +=4; ai++;
41149b5e25fSSatish Balay   }
41249b5e25fSSatish Balay 
413d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4141ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
415dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
41649b5e25fSSatish Balay   PetscFunctionReturn(0);
41749b5e25fSSatish Balay }
41849b5e25fSSatish Balay 
4194a2ae208SSatish Balay #undef __FUNCT__
4204a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
421dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
42249b5e25fSSatish Balay {
42349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
424d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
425d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
426d9ca1df4SBarry Smith   const MatScalar   *v;
4276849ba73SBarry Smith   PetscErrorCode    ierr;
428d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
429d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
43098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
43149b5e25fSSatish Balay 
43249b5e25fSSatish Balay   PetscFunctionBegin;
4332dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
434d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4351ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
43649b5e25fSSatish Balay 
43749b5e25fSSatish Balay   v  = a->a;
43849b5e25fSSatish Balay   xb = x;
43949b5e25fSSatish Balay 
44049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
44149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
44249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
44349b5e25fSSatish Balay     ib          = aj + *ai;
444831a3094SHong Zhang     jmin        = 0;
44598c9bda7SSatish Balay     nonzerorow += (n>0);
4467fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
44749b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
44849b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
44949b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
45049b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
45149b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
452831a3094SHong Zhang       v        += 25; jmin++;
4537fbae186SHong Zhang     }
454444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
455444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
456831a3094SHong Zhang     for (j=jmin; j<n; j++) {
45749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
45849b5e25fSSatish Balay       cval       = ib[j]*5;
45949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
46049b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
46149b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
46249b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
46349b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
46449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
46549b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
46649b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
46749b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
46849b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
46949b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
47049b5e25fSSatish Balay       v        += 25;
47149b5e25fSSatish Balay     }
47249b5e25fSSatish Balay     xb +=5; ai++;
47349b5e25fSSatish Balay   }
47449b5e25fSSatish Balay 
475d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4761ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
477dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
47849b5e25fSSatish Balay   PetscFunctionReturn(0);
47949b5e25fSSatish Balay }
48049b5e25fSSatish Balay 
48149b5e25fSSatish Balay 
4824a2ae208SSatish Balay #undef __FUNCT__
4834a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
484dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
48549b5e25fSSatish Balay {
48649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
487d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
488d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
489d9ca1df4SBarry Smith   const MatScalar   *v;
4906849ba73SBarry Smith   PetscErrorCode    ierr;
491d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
492d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
49398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
49449b5e25fSSatish Balay 
49549b5e25fSSatish Balay   PetscFunctionBegin;
4962dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
497d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4981ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
49949b5e25fSSatish Balay 
50049b5e25fSSatish Balay   v  = a->a;
50149b5e25fSSatish Balay   xb = x;
50249b5e25fSSatish Balay 
50349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
50449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
50549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
50649b5e25fSSatish Balay     ib          = aj + *ai;
507831a3094SHong Zhang     jmin        = 0;
50898c9bda7SSatish Balay     nonzerorow += (n>0);
5097fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
51049b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
51149b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
51249b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
51349b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
51449b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
51549b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
516831a3094SHong Zhang       v        += 36; jmin++;
5177fbae186SHong Zhang     }
518444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520831a3094SHong Zhang     for (j=jmin; j<n; j++) {
52149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
52249b5e25fSSatish Balay       cval       = ib[j]*6;
52349b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
52449b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
52549b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
52649b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
52749b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
52849b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
52949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
53049b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
53149b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
53249b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
53349b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
53449b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
53549b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
53649b5e25fSSatish Balay       v        += 36;
53749b5e25fSSatish Balay     }
53849b5e25fSSatish Balay     xb +=6; ai++;
53949b5e25fSSatish Balay   }
54049b5e25fSSatish Balay 
541d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5421ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
543dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
54449b5e25fSSatish Balay   PetscFunctionReturn(0);
54549b5e25fSSatish Balay }
5464a2ae208SSatish Balay #undef __FUNCT__
5474a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
548dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
54949b5e25fSSatish Balay {
55049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
551d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
552d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
553d9ca1df4SBarry Smith   const MatScalar   *v;
5546849ba73SBarry Smith   PetscErrorCode    ierr;
555d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
556d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
55798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
55849b5e25fSSatish Balay 
55949b5e25fSSatish Balay   PetscFunctionBegin;
5602dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
561d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5621ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
56349b5e25fSSatish Balay 
56449b5e25fSSatish Balay   v  = a->a;
56549b5e25fSSatish Balay   xb = x;
56649b5e25fSSatish Balay 
56749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
56849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
56949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
57049b5e25fSSatish Balay     ib          = aj + *ai;
571831a3094SHong Zhang     jmin        = 0;
57298c9bda7SSatish Balay     nonzerorow += (n>0);
5737fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
57449b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
57549b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
57649b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
57749b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
57849b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
57949b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
58049b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
581831a3094SHong Zhang       v        += 49; jmin++;
5827fbae186SHong Zhang     }
583444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
584444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585831a3094SHong Zhang     for (j=jmin; j<n; j++) {
58649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
58749b5e25fSSatish Balay       cval       = ib[j]*7;
58849b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
58949b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
59049b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
59149b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
59249b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
59349b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
59449b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
59549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
59649b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
59749b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
59849b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
59949b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
60049b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
60149b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
60249b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
60349b5e25fSSatish Balay       v       += 49;
60449b5e25fSSatish Balay     }
60549b5e25fSSatish Balay     xb +=7; ai++;
60649b5e25fSSatish Balay   }
607d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6081ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
609dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
61049b5e25fSSatish Balay   PetscFunctionReturn(0);
61149b5e25fSSatish Balay }
61249b5e25fSSatish Balay 
61349b5e25fSSatish Balay /*
61449b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
61549b5e25fSSatish Balay */
6164a2ae208SSatish Balay #undef __FUNCT__
6174a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
618dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
61949b5e25fSSatish Balay {
62049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
621d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
622d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
623d9ca1df4SBarry Smith   const MatScalar   *v;
624dfbe8321SBarry Smith   PetscErrorCode    ierr;
625d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
626d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
62798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
62849b5e25fSSatish Balay 
62949b5e25fSSatish Balay   PetscFunctionBegin;
6302dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
631d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6321ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
63349b5e25fSSatish Balay 
63449b5e25fSSatish Balay   aj = a->j;
63549b5e25fSSatish Balay   v  = a->a;
63649b5e25fSSatish Balay   ii = a->i;
63749b5e25fSSatish Balay 
63849b5e25fSSatish Balay   if (!a->mult_work) {
639854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
64049b5e25fSSatish Balay   }
64149b5e25fSSatish Balay   work = a->mult_work;
64249b5e25fSSatish Balay 
64349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
64449b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
64549b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
64698c9bda7SSatish Balay     nonzerorow += (n>0);
64749b5e25fSSatish Balay 
64849b5e25fSSatish Balay     /* upper triangular part */
64949b5e25fSSatish Balay     for (j=0; j<n; j++) {
65049b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
65149b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
65249b5e25fSSatish Balay       workt += bs;
65349b5e25fSSatish Balay     }
65449b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
65596b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
65649b5e25fSSatish Balay 
65749b5e25fSSatish Balay     /* strict lower triangular part */
658831a3094SHong Zhang     idx = aj+ii[0];
659831a3094SHong Zhang     if (*idx == i) {
66096b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
661831a3094SHong Zhang     }
66296b9376eSHong Zhang 
66349b5e25fSSatish Balay     if (ncols > 0) {
66449b5e25fSSatish Balay       workt = work;
66587828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
66696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
667831a3094SHong Zhang       for (j=0; j<n; j++) {
668831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
66949b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
67049b5e25fSSatish Balay         workt += bs;
67149b5e25fSSatish Balay       }
67249b5e25fSSatish Balay     }
67349b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
67449b5e25fSSatish Balay   }
67549b5e25fSSatish Balay 
676d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6771ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
678dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
67949b5e25fSSatish Balay   PetscFunctionReturn(0);
68049b5e25fSSatish Balay }
68149b5e25fSSatish Balay 
6824a2ae208SSatish Balay #undef __FUNCT__
6834a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
684dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
68549b5e25fSSatish Balay {
68649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
687d9ca1df4SBarry Smith   PetscScalar       *z,x1;
688d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
689d9ca1df4SBarry Smith   const MatScalar   *v;
6906849ba73SBarry Smith   PetscErrorCode    ierr;
691d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
692d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
69398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
69449b5e25fSSatish Balay 
69549b5e25fSSatish Balay   PetscFunctionBegin;
696343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
697d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6981ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
69949b5e25fSSatish Balay   v    = a->a;
70049b5e25fSSatish Balay   xb   = x;
70149b5e25fSSatish Balay 
70249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
70349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
70449b5e25fSSatish Balay     x1          = xb[0];
70549b5e25fSSatish Balay     ib          = aj + *ai;
706831a3094SHong Zhang     jmin        = 0;
70798c9bda7SSatish Balay     nonzerorow += (n>0);
708831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
709831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
710831a3094SHong Zhang     }
711831a3094SHong Zhang     for (j=jmin; j<n; j++) {
71249b5e25fSSatish Balay       cval    = *ib;
71349b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
71449b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
71549b5e25fSSatish Balay     }
71649b5e25fSSatish Balay     xb++; ai++;
71749b5e25fSSatish Balay   }
71849b5e25fSSatish Balay 
719d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
720bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
72149b5e25fSSatish Balay 
722dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
72349b5e25fSSatish Balay   PetscFunctionReturn(0);
72449b5e25fSSatish Balay }
72549b5e25fSSatish Balay 
7264a2ae208SSatish Balay #undef __FUNCT__
7274a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
728dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
72949b5e25fSSatish Balay {
73049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
731d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
732d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
733d9ca1df4SBarry Smith   const MatScalar   *v;
7346849ba73SBarry Smith   PetscErrorCode    ierr;
735d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
736d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
73798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
73849b5e25fSSatish Balay 
73949b5e25fSSatish Balay   PetscFunctionBegin;
740343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
741d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7421ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
74349b5e25fSSatish Balay 
74449b5e25fSSatish Balay   v  = a->a;
74549b5e25fSSatish Balay   xb = x;
74649b5e25fSSatish Balay 
74749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
74849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
74949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
75049b5e25fSSatish Balay     ib          = aj + *ai;
751831a3094SHong Zhang     jmin        = 0;
75298c9bda7SSatish Balay     nonzerorow += (n>0);
7537fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
75449b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
75549b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
756831a3094SHong Zhang       v        += 4; jmin++;
7577fbae186SHong Zhang     }
758444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
759444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
760831a3094SHong Zhang     for (j=jmin; j<n; j++) {
76149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
76249b5e25fSSatish Balay       cval       = ib[j]*2;
76349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
76449b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
76549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
76649b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
76749b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
76849b5e25fSSatish Balay       v        += 4;
76949b5e25fSSatish Balay     }
77049b5e25fSSatish Balay     xb +=2; ai++;
77149b5e25fSSatish Balay   }
772d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
773bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
77449b5e25fSSatish Balay 
775dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
77649b5e25fSSatish Balay   PetscFunctionReturn(0);
77749b5e25fSSatish Balay }
77849b5e25fSSatish Balay 
7794a2ae208SSatish Balay #undef __FUNCT__
7804a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
781dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
78249b5e25fSSatish Balay {
78349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
784d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
785d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
786d9ca1df4SBarry Smith   const MatScalar   *v;
7876849ba73SBarry Smith   PetscErrorCode    ierr;
788d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
789d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
79098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
79149b5e25fSSatish Balay 
79249b5e25fSSatish Balay   PetscFunctionBegin;
793343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
794d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7951ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
79649b5e25fSSatish Balay 
79749b5e25fSSatish Balay   v  = a->a;
79849b5e25fSSatish Balay   xb = x;
79949b5e25fSSatish Balay 
80049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
80149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
80249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
80349b5e25fSSatish Balay     ib          = aj + *ai;
804831a3094SHong Zhang     jmin        = 0;
80598c9bda7SSatish Balay     nonzerorow += (n>0);
8067fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
80749b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
80849b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
80949b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810831a3094SHong Zhang       v        += 9; jmin++;
8117fbae186SHong Zhang     }
812444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
813444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
814831a3094SHong Zhang     for (j=jmin; j<n; j++) {
81549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
81649b5e25fSSatish Balay       cval       = ib[j]*3;
81749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
81849b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
81949b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
82049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
82149b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
82249b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
82349b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
82449b5e25fSSatish Balay       v        += 9;
82549b5e25fSSatish Balay     }
82649b5e25fSSatish Balay     xb +=3; ai++;
82749b5e25fSSatish Balay   }
82849b5e25fSSatish Balay 
829d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
830bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
83149b5e25fSSatish Balay 
832dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
83349b5e25fSSatish Balay   PetscFunctionReturn(0);
83449b5e25fSSatish Balay }
83549b5e25fSSatish Balay 
8364a2ae208SSatish Balay #undef __FUNCT__
8374a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
838dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
83949b5e25fSSatish Balay {
84049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
841d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
842d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
843d9ca1df4SBarry Smith   const MatScalar   *v;
8446849ba73SBarry Smith   PetscErrorCode    ierr;
845d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
846d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
84798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
84849b5e25fSSatish Balay 
84949b5e25fSSatish Balay   PetscFunctionBegin;
850343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
851d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8521ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
85349b5e25fSSatish Balay 
85449b5e25fSSatish Balay   v  = a->a;
85549b5e25fSSatish Balay   xb = x;
85649b5e25fSSatish Balay 
85749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
85849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
85949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
86049b5e25fSSatish Balay     ib          = aj + *ai;
861831a3094SHong Zhang     jmin        = 0;
86298c9bda7SSatish Balay     nonzerorow += (n>0);
8637fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
86449b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
86549b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
86649b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
86749b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
868831a3094SHong Zhang       v        += 16; jmin++;
8697fbae186SHong Zhang     }
870444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
871444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
872831a3094SHong Zhang     for (j=jmin; j<n; j++) {
87349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
87449b5e25fSSatish Balay       cval       = ib[j]*4;
87549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
87649b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
87749b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
87849b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
87949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
88049b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
88149b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
88249b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
88349b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
88449b5e25fSSatish Balay       v        += 16;
88549b5e25fSSatish Balay     }
88649b5e25fSSatish Balay     xb +=4; ai++;
88749b5e25fSSatish Balay   }
88849b5e25fSSatish Balay 
889d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
890bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
89149b5e25fSSatish Balay 
892dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
89349b5e25fSSatish Balay   PetscFunctionReturn(0);
89449b5e25fSSatish Balay }
89549b5e25fSSatish Balay 
8964a2ae208SSatish Balay #undef __FUNCT__
8974a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
898dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
89949b5e25fSSatish Balay {
90049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
901d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
902d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
903d9ca1df4SBarry Smith   const MatScalar   *v;
9046849ba73SBarry Smith   PetscErrorCode    ierr;
905d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
906d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
90798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
90849b5e25fSSatish Balay 
90949b5e25fSSatish Balay   PetscFunctionBegin;
910343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
911d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9121ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
91349b5e25fSSatish Balay 
91449b5e25fSSatish Balay   v  = a->a;
91549b5e25fSSatish Balay   xb = x;
91649b5e25fSSatish Balay 
91749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
91849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
91949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
92049b5e25fSSatish Balay     ib          = aj + *ai;
921831a3094SHong Zhang     jmin        = 0;
92298c9bda7SSatish Balay     nonzerorow += (n>0);
9237fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
92449b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
92549b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
92649b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
92749b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
92849b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
929831a3094SHong Zhang       v        += 25; jmin++;
9307fbae186SHong Zhang     }
931444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
932444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
933831a3094SHong Zhang     for (j=jmin; j<n; j++) {
93449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
93549b5e25fSSatish Balay       cval       = ib[j]*5;
93649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
93749b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
93849b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
93949b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
94049b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
94149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
94249b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
94349b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
94449b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
94549b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
94649b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
94749b5e25fSSatish Balay       v        += 25;
94849b5e25fSSatish Balay     }
94949b5e25fSSatish Balay     xb +=5; ai++;
95049b5e25fSSatish Balay   }
95149b5e25fSSatish Balay 
952d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
953bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
95449b5e25fSSatish Balay 
955dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
95649b5e25fSSatish Balay   PetscFunctionReturn(0);
95749b5e25fSSatish Balay }
9584a2ae208SSatish Balay #undef __FUNCT__
9594a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
960dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
96149b5e25fSSatish Balay {
96249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
963d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
964d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
965d9ca1df4SBarry Smith   const MatScalar   *v;
9666849ba73SBarry Smith   PetscErrorCode    ierr;
967d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
968d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
96998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
97049b5e25fSSatish Balay 
97149b5e25fSSatish Balay   PetscFunctionBegin;
972343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
973d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9741ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
97549b5e25fSSatish Balay 
97649b5e25fSSatish Balay   v  = a->a;
97749b5e25fSSatish Balay   xb = x;
97849b5e25fSSatish Balay 
97949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
98049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
98149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
98249b5e25fSSatish Balay     ib          = aj + *ai;
983831a3094SHong Zhang     jmin        = 0;
98498c9bda7SSatish Balay     nonzerorow += (n>0);
9857fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
98649b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
98749b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
98849b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
98949b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
99049b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
99149b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
992831a3094SHong Zhang       v        += 36; jmin++;
9937fbae186SHong Zhang     }
994444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
995444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
996831a3094SHong Zhang     for (j=jmin; j<n; j++) {
99749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
99849b5e25fSSatish Balay       cval       = ib[j]*6;
99949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
100049b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
100149b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
100249b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
100349b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
100449b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
100549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
100649b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
100749b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
100849b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
100949b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
101049b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
101149b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
101249b5e25fSSatish Balay       v        += 36;
101349b5e25fSSatish Balay     }
101449b5e25fSSatish Balay     xb +=6; ai++;
101549b5e25fSSatish Balay   }
101649b5e25fSSatish Balay 
1017d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1018bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
101949b5e25fSSatish Balay 
1020dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
102149b5e25fSSatish Balay   PetscFunctionReturn(0);
102249b5e25fSSatish Balay }
102349b5e25fSSatish Balay 
10244a2ae208SSatish Balay #undef __FUNCT__
10254a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1026dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
102749b5e25fSSatish Balay {
102849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1029d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1030d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1031d9ca1df4SBarry Smith   const MatScalar   *v;
10326849ba73SBarry Smith   PetscErrorCode    ierr;
1033d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1034d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
103598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
103649b5e25fSSatish Balay 
103749b5e25fSSatish Balay   PetscFunctionBegin;
1038343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1039d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10401ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
104149b5e25fSSatish Balay 
104249b5e25fSSatish Balay   v  = a->a;
104349b5e25fSSatish Balay   xb = x;
104449b5e25fSSatish Balay 
104549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
104649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
104749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
104849b5e25fSSatish Balay     ib          = aj + *ai;
1049831a3094SHong Zhang     jmin        = 0;
105098c9bda7SSatish Balay     nonzerorow += (n>0);
10517fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
105249b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
105349b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
105449b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
105549b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
105649b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
105749b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
105849b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1059831a3094SHong Zhang       v        += 49; jmin++;
10607fbae186SHong Zhang     }
1061444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1062444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1063831a3094SHong Zhang     for (j=jmin; j<n; j++) {
106449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
106549b5e25fSSatish Balay       cval       = ib[j]*7;
106649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
106749b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
106849b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
106949b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
107049b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
107149b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
107249b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
107349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
107449b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
107549b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
107649b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
107749b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
107849b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
107949b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
108049b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
108149b5e25fSSatish Balay       v       += 49;
108249b5e25fSSatish Balay     }
108349b5e25fSSatish Balay     xb +=7; ai++;
108449b5e25fSSatish Balay   }
108549b5e25fSSatish Balay 
1086d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1087bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
108849b5e25fSSatish Balay 
1089dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
109049b5e25fSSatish Balay   PetscFunctionReturn(0);
109149b5e25fSSatish Balay }
109249b5e25fSSatish Balay 
10934a2ae208SSatish Balay #undef __FUNCT__
10944a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1095dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
109649b5e25fSSatish Balay {
109749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1098d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1099d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1100d9ca1df4SBarry Smith   const MatScalar   *v;
1101dfbe8321SBarry Smith   PetscErrorCode    ierr;
1102d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1103d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
110498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
110549b5e25fSSatish Balay 
110649b5e25fSSatish Balay   PetscFunctionBegin;
1107343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1108d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
11091ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
111049b5e25fSSatish Balay 
111149b5e25fSSatish Balay   aj = a->j;
111249b5e25fSSatish Balay   v  = a->a;
111349b5e25fSSatish Balay   ii = a->i;
111449b5e25fSSatish Balay 
111549b5e25fSSatish Balay   if (!a->mult_work) {
1116854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
111749b5e25fSSatish Balay   }
111849b5e25fSSatish Balay   work = a->mult_work;
111949b5e25fSSatish Balay 
112049b5e25fSSatish Balay 
112149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
112249b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
112349b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
112498c9bda7SSatish Balay     nonzerorow += (n>0);
112549b5e25fSSatish Balay 
112649b5e25fSSatish Balay     /* upper triangular part */
112749b5e25fSSatish Balay     for (j=0; j<n; j++) {
112849b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
112949b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
113049b5e25fSSatish Balay       workt += bs;
113149b5e25fSSatish Balay     }
113249b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
113396b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
113449b5e25fSSatish Balay 
113549b5e25fSSatish Balay     /* strict lower triangular part */
1136831a3094SHong Zhang     idx = aj+ii[0];
1137831a3094SHong Zhang     if (*idx == i) {
113896b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1139831a3094SHong Zhang     }
114049b5e25fSSatish Balay     if (ncols > 0) {
114149b5e25fSSatish Balay       workt = work;
114287828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
114396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1144831a3094SHong Zhang       for (j=0; j<n; j++) {
1145831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
114649b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
114749b5e25fSSatish Balay         workt += bs;
114849b5e25fSSatish Balay       }
114949b5e25fSSatish Balay     }
115049b5e25fSSatish Balay 
115149b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
115249b5e25fSSatish Balay   }
115349b5e25fSSatish Balay 
1154d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1155bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
115649b5e25fSSatish Balay 
1157dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
115849b5e25fSSatish Balay   PetscFunctionReturn(0);
115949b5e25fSSatish Balay }
116049b5e25fSSatish Balay 
11614a2ae208SSatish Balay #undef __FUNCT__
11624a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1163f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
116449b5e25fSSatish Balay {
116549b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1166f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1167efee365bSSatish Balay   PetscErrorCode ierr;
1168c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
116949b5e25fSSatish Balay 
117049b5e25fSSatish Balay   PetscFunctionBegin;
1171c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11728b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1173efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
117449b5e25fSSatish Balay   PetscFunctionReturn(0);
117549b5e25fSSatish Balay }
117649b5e25fSSatish Balay 
11774a2ae208SSatish Balay #undef __FUNCT__
11784a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1179dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
118049b5e25fSSatish Balay {
118149b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1182d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
118349b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1184d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11856849ba73SBarry Smith   PetscErrorCode  ierr;
1186d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
118749b5e25fSSatish Balay 
118849b5e25fSSatish Balay   PetscFunctionBegin;
118949b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
119049b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
119149b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1192831a3094SHong Zhang       col  = aj + jmin;
1193831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
119449b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
119549b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
119649b5e25fSSatish Balay         }
1197831a3094SHong Zhang         jmin++;
1198831a3094SHong Zhang       }
1199831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
120049b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
120149b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
120249b5e25fSSatish Balay         }
120349b5e25fSSatish Balay       }
120449b5e25fSSatish Balay     }
12058f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
12060b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1207dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
12080b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
12090b8dc8d2SHong Zhang     il[0] = 0;
121049b5e25fSSatish Balay 
121149b5e25fSSatish Balay     *norm = 0.0;
121249b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
121349b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
121449b5e25fSSatish Balay       /*-- col sum --*/
121549b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1216831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1217831a3094SHong Zhang                   at step k */
121849b5e25fSSatish Balay       while (i<mbs) {
121949b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
122049b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
122149b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122249b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
122349b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
122449b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
122549b5e25fSSatish Balay           }
122649b5e25fSSatish Balay         }
122749b5e25fSSatish Balay         /* update il, jl */
1228831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1229831a3094SHong Zhang         jmax = a->i[i+1];
123049b5e25fSSatish Balay         if (jmin < jmax) {
123149b5e25fSSatish Balay           il[i] = jmin;
123249b5e25fSSatish Balay           j     = a->j[jmin];
123349b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
123449b5e25fSSatish Balay         }
123549b5e25fSSatish Balay         i = nexti;
123649b5e25fSSatish Balay       }
123749b5e25fSSatish Balay       /*-- row sum --*/
123849b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
123949b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
124049b5e25fSSatish Balay         for (j=0; j<bs; j++) {
124149b5e25fSSatish Balay           v = a->a + i*bs2 + j;
124249b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12430b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
124449b5e25fSSatish Balay           }
124549b5e25fSSatish Balay         }
124649b5e25fSSatish Balay       }
124749b5e25fSSatish Balay       /* add k_th block row to il, jl */
1248831a3094SHong Zhang       col = aj+jmin;
1249831a3094SHong Zhang       if (*col == k) jmin++;
125049b5e25fSSatish Balay       if (jmin < jmax) {
125149b5e25fSSatish Balay         il[k] = jmin;
12520b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
125349b5e25fSSatish Balay       }
125449b5e25fSSatish Balay       for (j=0; j<bs; j++) {
125549b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
125649b5e25fSSatish Balay       }
125749b5e25fSSatish Balay     }
125874ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1259f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
126049b5e25fSSatish Balay   PetscFunctionReturn(0);
126149b5e25fSSatish Balay }
126249b5e25fSSatish Balay 
12634a2ae208SSatish Balay #undef __FUNCT__
12644a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1265ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
126649b5e25fSSatish Balay {
126749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1268dfbe8321SBarry Smith   PetscErrorCode ierr;
126949b5e25fSSatish Balay 
127049b5e25fSSatish Balay   PetscFunctionBegin;
127149b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1272d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1273ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1274ef511fbeSHong Zhang     PetscFunctionReturn(0);
127549b5e25fSSatish Balay   }
127649b5e25fSSatish Balay 
127749b5e25fSSatish Balay   /* if the a->i are the same */
127813f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
127926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
128049b5e25fSSatish Balay 
128149b5e25fSSatish Balay   /* if a->j are the same */
128213f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
128326fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
128426fbe8dcSKarl Rupp 
128549b5e25fSSatish Balay   /* if a->a are the same */
1286d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1287935af2e7SHong Zhang   PetscFunctionReturn(0);
128849b5e25fSSatish Balay }
128949b5e25fSSatish Balay 
12904a2ae208SSatish Balay #undef __FUNCT__
12914a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1292dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
129349b5e25fSSatish Balay {
129449b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1295dfbe8321SBarry Smith   PetscErrorCode  ierr;
1296d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1297d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
129887828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1299d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
130049b5e25fSSatish Balay 
130149b5e25fSSatish Balay   PetscFunctionBegin;
1302d0f46423SBarry Smith   bs = A->rmap->bs;
1303e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
130482799104SHong Zhang 
130549b5e25fSSatish Balay   aa   = a->a;
13068a0c6efdSHong Zhang   ambs = a->mbs;
13078a0c6efdSHong Zhang 
13088a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
13098a0c6efdSHong Zhang     PetscInt *diag=a->diag;
13108a0c6efdSHong Zhang     aa   = a->a;
13118a0c6efdSHong Zhang     ambs = a->mbs;
13128a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13138a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13148a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13158a0c6efdSHong Zhang     PetscFunctionReturn(0);
13168a0c6efdSHong Zhang   }
13178a0c6efdSHong Zhang 
131849b5e25fSSatish Balay   ai   = a->i;
131949b5e25fSSatish Balay   aj   = a->j;
132049b5e25fSSatish Balay   bs2  = a->bs2;
13212dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13221ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
132349b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
132449b5e25fSSatish Balay     j=ai[i];
132549b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
132649b5e25fSSatish Balay       row  = i*bs;
132749b5e25fSSatish Balay       aa_j = aa + j*bs2;
132849b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
132949b5e25fSSatish Balay     }
133049b5e25fSSatish Balay   }
13311ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
133249b5e25fSSatish Balay   PetscFunctionReturn(0);
133349b5e25fSSatish Balay }
133449b5e25fSSatish Balay 
13354a2ae208SSatish Balay #undef __FUNCT__
13364a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1337dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
133849b5e25fSSatish Balay {
133949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1340d9ca1df4SBarry Smith   PetscScalar       x;
1341d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
134249b5e25fSSatish Balay   MatScalar         *aa,*v;
1343dfbe8321SBarry Smith   PetscErrorCode    ierr;
13445e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1345ace3abfcSBarry Smith   PetscBool         flg;
134649b5e25fSSatish Balay 
134749b5e25fSSatish Balay   PetscFunctionBegin;
1348b3bf805bSHong Zhang   if (ll != rr) {
1349b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1350e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1351b3bf805bSHong Zhang   }
1352b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
135349b5e25fSSatish Balay   ai  = a->i;
135449b5e25fSSatish Balay   aj  = a->j;
135549b5e25fSSatish Balay   aa  = a->a;
1356d0f46423SBarry Smith   m   = A->rmap->N;
1357d0f46423SBarry Smith   bs  = A->rmap->bs;
135849b5e25fSSatish Balay   mbs = a->mbs;
135949b5e25fSSatish Balay   bs2 = a->bs2;
136049b5e25fSSatish Balay 
1361d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
136249b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1363e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
136449b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
136549b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
136649b5e25fSSatish Balay     li = l + i*bs;
136749b5e25fSSatish Balay     v  = aa + bs2*ai[i];
136849b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
136949b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13705e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
137149b5e25fSSatish Balay         x = ri[k];
137249b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
137349b5e25fSSatish Balay       }
137449b5e25fSSatish Balay     }
137549b5e25fSSatish Balay   }
1376d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1377dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
137849b5e25fSSatish Balay   PetscFunctionReturn(0);
137949b5e25fSSatish Balay }
138049b5e25fSSatish Balay 
13814a2ae208SSatish Balay #undef __FUNCT__
13824a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1383dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
138449b5e25fSSatish Balay {
138549b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
138649b5e25fSSatish Balay 
138749b5e25fSSatish Balay   PetscFunctionBegin;
138849b5e25fSSatish Balay   info->block_size   = a->bs2;
1389ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13906c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
139149b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
139249b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13938e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13947adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1395d5f3da31SBarry Smith   if (A->factortype) {
139649b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
139749b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
139849b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
139949b5e25fSSatish Balay   } else {
140049b5e25fSSatish Balay     info->fill_ratio_given  = 0;
140149b5e25fSSatish Balay     info->fill_ratio_needed = 0;
140249b5e25fSSatish Balay     info->factor_mallocs    = 0;
140349b5e25fSSatish Balay   }
140449b5e25fSSatish Balay   PetscFunctionReturn(0);
140549b5e25fSSatish Balay }
140649b5e25fSSatish Balay 
140749b5e25fSSatish Balay 
14084a2ae208SSatish Balay #undef __FUNCT__
14094a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1410dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
141149b5e25fSSatish Balay {
141249b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1413dfbe8321SBarry Smith   PetscErrorCode ierr;
141449b5e25fSSatish Balay 
141549b5e25fSSatish Balay   PetscFunctionBegin;
141649b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
141749b5e25fSSatish Balay   PetscFunctionReturn(0);
141849b5e25fSSatish Balay }
1419dc354874SHong Zhang 
14204a2ae208SSatish Balay #undef __FUNCT__
1421985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1422985db425SBarry Smith /*
1423985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1424985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1425985db425SBarry Smith */
1426985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1427dc354874SHong Zhang {
1428dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1429dfbe8321SBarry Smith   PetscErrorCode  ierr;
1430d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1431d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1432c3fca9a7SHong Zhang   PetscReal       atmp;
1433d9ca1df4SBarry Smith   const MatScalar *aa;
1434985db425SBarry Smith   PetscScalar     *x;
143513f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1436dc354874SHong Zhang 
1437dc354874SHong Zhang   PetscFunctionBegin;
1438e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1439e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1440d0f46423SBarry Smith   bs  = A->rmap->bs;
1441dc354874SHong Zhang   aa  = a->a;
1442dc354874SHong Zhang   ai  = a->i;
1443dc354874SHong Zhang   aj  = a->j;
144444117c81SHong Zhang   mbs = a->mbs;
1445dc354874SHong Zhang 
1446985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14471ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1448dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1449e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
145044117c81SHong Zhang   for (i=0; i<mbs; i++) {
1451d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1452d0f6400bSHong Zhang     brow  = bs*i;
145344117c81SHong Zhang     for (j=0; j<ncols; j++) {
1454d0f6400bSHong Zhang       bcol = bs*(*aj);
145544117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1456d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
145744117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1458d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1459d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1460c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1461c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
146244117c81SHong Zhang         }
146344117c81SHong Zhang       }
1464d0f6400bSHong Zhang       aj++;
1465dc354874SHong Zhang     }
1466dc354874SHong Zhang   }
14671ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1468dc354874SHong Zhang   PetscFunctionReturn(0);
1469dc354874SHong Zhang }
1470