xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 8f46ffca725d709c802a056bffe944fc7c220922)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
954a2ae208SSatish Balay #undef __FUNCT__
964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97*8f46ffcaSHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) //rm
9849b5e25fSSatish Balay {
9949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
1006849ba73SBarry Smith   PetscErrorCode  ierr;
10113f74950SBarry Smith   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
10213f74950SBarry Smith   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
1035d0c19d7SBarry Smith   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
1045d0c19d7SBarry Smith   const PetscInt  *irow,*aj = a->j,*ai = a->i;
10549b5e25fSSatish Balay   MatScalar       *mat_a;
10649b5e25fSSatish Balay   Mat             C;
107ace3abfcSBarry Smith   PetscBool       flag,sorted;
10849b5e25fSSatish Balay 
10949b5e25fSSatish Balay   PetscFunctionBegin;
110*8f46ffcaSHong Zhang   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); //rm!!!
111*8f46ffcaSHong Zhang   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); //rm!!!
112*8f46ffcaSHong Zhang   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); //rm!!!
11349b5e25fSSatish Balay 
11449b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
11549b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
11649b5e25fSSatish Balay 
117785e854fSJed Brown   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
11874ed9c26SBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
11949b5e25fSSatish Balay   ssmap = smap;
120854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
12149b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
12249b5e25fSSatish Balay   /* determine lens of each row */
12349b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
12449b5e25fSSatish Balay     kstart  = ai[irow[i]];
12549b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
12649b5e25fSSatish Balay     lens[i] = 0;
12749b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
12826fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
12949b5e25fSSatish Balay     }
13049b5e25fSSatish Balay   }
13149b5e25fSSatish Balay   /* Create and fill new matrix */
13249b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
13349b5e25fSSatish Balay     c = (Mat_SeqSBAIJ*)((*B)->data);
13449b5e25fSSatish Balay 
135e32f2f54SBarry Smith     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
13613f74950SBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
13813f74950SBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
13949b5e25fSSatish Balay     C    = *B;
14049b5e25fSSatish Balay   } else {
141ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
142f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1437adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
144ab93d7beSBarry Smith     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
14549b5e25fSSatish Balay   }
14649b5e25fSSatish Balay   c = (Mat_SeqSBAIJ*)(C->data);
14749b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
14849b5e25fSSatish Balay     row      = irow[i];
14949b5e25fSSatish Balay     kstart   = ai[row];
15049b5e25fSSatish Balay     kend     = kstart + a->ilen[row];
15149b5e25fSSatish Balay     mat_i    = c->i[i];
15249b5e25fSSatish Balay     mat_j    = c->j + mat_i;
15349b5e25fSSatish Balay     mat_a    = c->a + mat_i*bs2;
15449b5e25fSSatish Balay     mat_ilen = c->ilen + i;
15549b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
15649b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
15749b5e25fSSatish Balay         *mat_j++ = tcol - 1;
15849b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
15949b5e25fSSatish Balay         mat_a   += bs2;
16049b5e25fSSatish Balay         (*mat_ilen)++;
16149b5e25fSSatish Balay       }
16249b5e25fSSatish Balay     }
16349b5e25fSSatish Balay   }
16449b5e25fSSatish Balay 
16549b5e25fSSatish Balay   /* Free work space */
16649b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
16749b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
16849b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16949b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
17049b5e25fSSatish Balay 
17149b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
17249b5e25fSSatish Balay   *B   = C;
17349b5e25fSSatish Balay   PetscFunctionReturn(0);
17449b5e25fSSatish Balay }
17549b5e25fSSatish Balay 
1764a2ae208SSatish Balay #undef __FUNCT__
1774a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
1784aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
17949b5e25fSSatish Balay {
18049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
18149b5e25fSSatish Balay   IS             is1;
1826849ba73SBarry Smith   PetscErrorCode ierr;
1835d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
1845d0c19d7SBarry Smith   const PetscInt *irow;
18549b5e25fSSatish Balay 
18649b5e25fSSatish Balay   PetscFunctionBegin;
187*8f46ffcaSHong Zhang   if (isrow != iscol) {
188*8f46ffcaSHong Zhang     PetscBool isequal;
189*8f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
190*8f46ffcaSHong Zhang     if (!isequal)
191*8f46ffcaSHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
192*8f46ffcaSHong Zhang   }
19349b5e25fSSatish Balay 
194*8f46ffcaSHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); //isrow == iscol, so delete below!!!
19549b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
19649b5e25fSSatish Balay 
19749b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
198*8f46ffcaSHong Zhang    and form the IS with compressed IS -- also sort it!!! */
199dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
20074ed9c26SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
20149b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
20249b5e25fSSatish Balay 
20349b5e25fSSatish Balay   count = 0;
20449b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
205e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
20649b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
20749b5e25fSSatish Balay   }
20849b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
20970b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
21074ed9c26SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
21149b5e25fSSatish Balay 
212*8f46ffcaSHong Zhang   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); //remove a is1!!!
2136bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
21449b5e25fSSatish Balay   PetscFunctionReturn(0);
21549b5e25fSSatish Balay }
21649b5e25fSSatish Balay 
2174a2ae208SSatish Balay #undef __FUNCT__
2184a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
21913f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
22049b5e25fSSatish Balay {
2216849ba73SBarry Smith   PetscErrorCode ierr;
22213f74950SBarry Smith   PetscInt       i;
22349b5e25fSSatish Balay 
22449b5e25fSSatish Balay   PetscFunctionBegin;
22549b5e25fSSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
226854ce69bSBarry Smith     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
22749b5e25fSSatish Balay   }
22849b5e25fSSatish Balay 
22949b5e25fSSatish Balay   for (i=0; i<n; i++) {
2304aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
23149b5e25fSSatish Balay   }
23249b5e25fSSatish Balay   PetscFunctionReturn(0);
23349b5e25fSSatish Balay }
23449b5e25fSSatish Balay 
23549b5e25fSSatish Balay /* -------------------------------------------------------*/
23649b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
23749b5e25fSSatish Balay /* -------------------------------------------------------*/
23849b5e25fSSatish Balay 
2394a2ae208SSatish Balay #undef __FUNCT__
2404a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
241dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
24249b5e25fSSatish Balay {
24349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
244d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
245d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
246d9ca1df4SBarry Smith   const MatScalar   *v;
2476849ba73SBarry Smith   PetscErrorCode    ierr;
248d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
249d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
25098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
25149b5e25fSSatish Balay 
25249b5e25fSSatish Balay   PetscFunctionBegin;
2532dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
254d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2551ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
25649b5e25fSSatish Balay 
25749b5e25fSSatish Balay   v  = a->a;
25849b5e25fSSatish Balay   xb = x;
25949b5e25fSSatish Balay 
26049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
26149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
26249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
26349b5e25fSSatish Balay     ib          = aj + *ai;
264831a3094SHong Zhang     jmin        = 0;
26598c9bda7SSatish Balay     nonzerorow += (n>0);
2667fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
26749b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
26849b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
269831a3094SHong Zhang       v        += 4; jmin++;
2707fbae186SHong Zhang     }
271444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
272444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
273831a3094SHong Zhang     for (j=jmin; j<n; j++) {
27449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
27549b5e25fSSatish Balay       cval       = ib[j]*2;
27649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
27749b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
27849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
27949b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
28049b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
28149b5e25fSSatish Balay       v        += 4;
28249b5e25fSSatish Balay     }
28349b5e25fSSatish Balay     xb +=2; ai++;
28449b5e25fSSatish Balay   }
28549b5e25fSSatish Balay 
286d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2871ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
288dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
28949b5e25fSSatish Balay   PetscFunctionReturn(0);
29049b5e25fSSatish Balay }
29149b5e25fSSatish Balay 
2924a2ae208SSatish Balay #undef __FUNCT__
2934a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
294dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
29549b5e25fSSatish Balay {
29649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
297d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
298d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
299d9ca1df4SBarry Smith   const MatScalar   *v;
3006849ba73SBarry Smith   PetscErrorCode    ierr;
301d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
302d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
30398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   PetscFunctionBegin;
3062dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
307d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3081ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30949b5e25fSSatish Balay 
31049b5e25fSSatish Balay   v  = a->a;
31149b5e25fSSatish Balay   xb = x;
31249b5e25fSSatish Balay 
31349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
31449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
31649b5e25fSSatish Balay     ib          = aj + *ai;
317831a3094SHong Zhang     jmin        = 0;
31898c9bda7SSatish Balay     nonzerorow += (n>0);
3197fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
32049b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
32149b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
32249b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
323831a3094SHong Zhang       v        += 9; jmin++;
3247fbae186SHong Zhang     }
325444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
326444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
327831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32949b5e25fSSatish Balay       cval       = ib[j]*3;
33049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
33149b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
33249b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
33349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
33449b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
33549b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
33649b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
33749b5e25fSSatish Balay       v        += 9;
33849b5e25fSSatish Balay     }
33949b5e25fSSatish Balay     xb +=3; ai++;
34049b5e25fSSatish Balay   }
34149b5e25fSSatish Balay 
342d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3431ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
34549b5e25fSSatish Balay   PetscFunctionReturn(0);
34649b5e25fSSatish Balay }
34749b5e25fSSatish Balay 
3484a2ae208SSatish Balay #undef __FUNCT__
3494a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
350dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
35149b5e25fSSatish Balay {
35249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
353d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
354d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
355d9ca1df4SBarry Smith   const MatScalar   *v;
3566849ba73SBarry Smith   PetscErrorCode    ierr;
357d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
358d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
35998c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
36049b5e25fSSatish Balay 
36149b5e25fSSatish Balay   PetscFunctionBegin;
3622dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
363d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3641ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
36549b5e25fSSatish Balay 
36649b5e25fSSatish Balay   v  = a->a;
36749b5e25fSSatish Balay   xb = x;
36849b5e25fSSatish Balay 
36949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
37049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
37149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
37249b5e25fSSatish Balay     ib          = aj + *ai;
373831a3094SHong Zhang     jmin        = 0;
37498c9bda7SSatish Balay     nonzerorow += (n>0);
3757fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
37649b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
37749b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
37849b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
37949b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
380831a3094SHong Zhang       v        += 16; jmin++;
3817fbae186SHong Zhang     }
382444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
383444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
384831a3094SHong Zhang     for (j=jmin; j<n; j++) {
38549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38649b5e25fSSatish Balay       cval       = ib[j]*4;
38749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
38849b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
38949b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
39049b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
39149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
39249b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
39349b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
39449b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
39549b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
39649b5e25fSSatish Balay       v        += 16;
39749b5e25fSSatish Balay     }
39849b5e25fSSatish Balay     xb +=4; ai++;
39949b5e25fSSatish Balay   }
40049b5e25fSSatish Balay 
401d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4021ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
403dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
40449b5e25fSSatish Balay   PetscFunctionReturn(0);
40549b5e25fSSatish Balay }
40649b5e25fSSatish Balay 
4074a2ae208SSatish Balay #undef __FUNCT__
4084a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
409dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
41049b5e25fSSatish Balay {
41149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
412d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
413d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
414d9ca1df4SBarry Smith   const MatScalar   *v;
4156849ba73SBarry Smith   PetscErrorCode    ierr;
416d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
417d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
41898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
41949b5e25fSSatish Balay 
42049b5e25fSSatish Balay   PetscFunctionBegin;
4212dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
422d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4231ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
42449b5e25fSSatish Balay 
42549b5e25fSSatish Balay   v  = a->a;
42649b5e25fSSatish Balay   xb = x;
42749b5e25fSSatish Balay 
42849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
42949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
43049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
43149b5e25fSSatish Balay     ib          = aj + *ai;
432831a3094SHong Zhang     jmin        = 0;
43398c9bda7SSatish Balay     nonzerorow += (n>0);
4347fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
43549b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
43649b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
43749b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
43849b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
43949b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
440831a3094SHong Zhang       v        += 25; jmin++;
4417fbae186SHong Zhang     }
442444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
443444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
444831a3094SHong Zhang     for (j=jmin; j<n; j++) {
44549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44649b5e25fSSatish Balay       cval       = ib[j]*5;
44749b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
44849b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
44949b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
45049b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
45149b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
45249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
45349b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
45449b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
45549b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
45649b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
45749b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
45849b5e25fSSatish Balay       v        += 25;
45949b5e25fSSatish Balay     }
46049b5e25fSSatish Balay     xb +=5; ai++;
46149b5e25fSSatish Balay   }
46249b5e25fSSatish Balay 
463d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4641ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
465dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
46649b5e25fSSatish Balay   PetscFunctionReturn(0);
46749b5e25fSSatish Balay }
46849b5e25fSSatish Balay 
46949b5e25fSSatish Balay 
4704a2ae208SSatish Balay #undef __FUNCT__
4714a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
472dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
47349b5e25fSSatish Balay {
47449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
475d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
476d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
477d9ca1df4SBarry Smith   const MatScalar   *v;
4786849ba73SBarry Smith   PetscErrorCode    ierr;
479d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
480d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
48198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
48249b5e25fSSatish Balay 
48349b5e25fSSatish Balay   PetscFunctionBegin;
4842dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
485d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
48749b5e25fSSatish Balay 
48849b5e25fSSatish Balay   v  = a->a;
48949b5e25fSSatish Balay   xb = x;
49049b5e25fSSatish Balay 
49149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
49249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
49349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
49449b5e25fSSatish Balay     ib          = aj + *ai;
495831a3094SHong Zhang     jmin        = 0;
49698c9bda7SSatish Balay     nonzerorow += (n>0);
4977fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
49849b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
49949b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
50049b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
50149b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
50249b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
50349b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
504831a3094SHong Zhang       v        += 36; jmin++;
5057fbae186SHong Zhang     }
506444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
507444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
508831a3094SHong Zhang     for (j=jmin; j<n; j++) {
50949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
51049b5e25fSSatish Balay       cval       = ib[j]*6;
51149b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
51249b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
51349b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
51449b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
51549b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
51649b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
51749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51849b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
51949b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
52049b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
52149b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
52249b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
52349b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
52449b5e25fSSatish Balay       v        += 36;
52549b5e25fSSatish Balay     }
52649b5e25fSSatish Balay     xb +=6; ai++;
52749b5e25fSSatish Balay   }
52849b5e25fSSatish Balay 
529d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5301ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
531dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
53249b5e25fSSatish Balay   PetscFunctionReturn(0);
53349b5e25fSSatish Balay }
5344a2ae208SSatish Balay #undef __FUNCT__
5354a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
536dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
53749b5e25fSSatish Balay {
53849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
539d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
540d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
541d9ca1df4SBarry Smith   const MatScalar   *v;
5426849ba73SBarry Smith   PetscErrorCode    ierr;
543d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
544d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
54598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
54649b5e25fSSatish Balay 
54749b5e25fSSatish Balay   PetscFunctionBegin;
5482dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
549d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5501ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
55149b5e25fSSatish Balay 
55249b5e25fSSatish Balay   v  = a->a;
55349b5e25fSSatish Balay   xb = x;
55449b5e25fSSatish Balay 
55549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
55649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
55749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
55849b5e25fSSatish Balay     ib          = aj + *ai;
559831a3094SHong Zhang     jmin        = 0;
56098c9bda7SSatish Balay     nonzerorow += (n>0);
5617fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
56249b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
56349b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
56449b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
56549b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
56649b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
56749b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
56849b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569831a3094SHong Zhang       v        += 49; jmin++;
5707fbae186SHong Zhang     }
571444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
572444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
573831a3094SHong Zhang     for (j=jmin; j<n; j++) {
57449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57549b5e25fSSatish Balay       cval       = ib[j]*7;
57649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
57749b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
57849b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
57949b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
58049b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
58149b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
58249b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
58349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
58449b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
58549b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
58649b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
58749b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
58849b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
58949b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
59049b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
59149b5e25fSSatish Balay       v       += 49;
59249b5e25fSSatish Balay     }
59349b5e25fSSatish Balay     xb +=7; ai++;
59449b5e25fSSatish Balay   }
595d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5961ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
597dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
59849b5e25fSSatish Balay   PetscFunctionReturn(0);
59949b5e25fSSatish Balay }
60049b5e25fSSatish Balay 
60149b5e25fSSatish Balay /*
60249b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
60349b5e25fSSatish Balay */
6044a2ae208SSatish Balay #undef __FUNCT__
6054a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
606dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
60749b5e25fSSatish Balay {
60849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
609d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
610d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
611d9ca1df4SBarry Smith   const MatScalar   *v;
612dfbe8321SBarry Smith   PetscErrorCode    ierr;
613d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
614d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
61598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
61649b5e25fSSatish Balay 
61749b5e25fSSatish Balay   PetscFunctionBegin;
6182dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
619d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6201ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
62149b5e25fSSatish Balay 
62249b5e25fSSatish Balay   aj = a->j;
62349b5e25fSSatish Balay   v  = a->a;
62449b5e25fSSatish Balay   ii = a->i;
62549b5e25fSSatish Balay 
62649b5e25fSSatish Balay   if (!a->mult_work) {
627854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
62849b5e25fSSatish Balay   }
62949b5e25fSSatish Balay   work = a->mult_work;
63049b5e25fSSatish Balay 
63149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
63249b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
63349b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
63498c9bda7SSatish Balay     nonzerorow += (n>0);
63549b5e25fSSatish Balay 
63649b5e25fSSatish Balay     /* upper triangular part */
63749b5e25fSSatish Balay     for (j=0; j<n; j++) {
63849b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
63949b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
64049b5e25fSSatish Balay       workt += bs;
64149b5e25fSSatish Balay     }
64249b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
64396b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
64449b5e25fSSatish Balay 
64549b5e25fSSatish Balay     /* strict lower triangular part */
646831a3094SHong Zhang     idx = aj+ii[0];
647831a3094SHong Zhang     if (*idx == i) {
64896b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
649831a3094SHong Zhang     }
65096b9376eSHong Zhang 
65149b5e25fSSatish Balay     if (ncols > 0) {
65249b5e25fSSatish Balay       workt = work;
65387828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
65496b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
655831a3094SHong Zhang       for (j=0; j<n; j++) {
656831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
65749b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
65849b5e25fSSatish Balay         workt += bs;
65949b5e25fSSatish Balay       }
66049b5e25fSSatish Balay     }
66149b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
66249b5e25fSSatish Balay   }
66349b5e25fSSatish Balay 
664d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6651ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
666dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
66749b5e25fSSatish Balay   PetscFunctionReturn(0);
66849b5e25fSSatish Balay }
66949b5e25fSSatish Balay 
6704a2ae208SSatish Balay #undef __FUNCT__
6714a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
672dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
67349b5e25fSSatish Balay {
67449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
675d9ca1df4SBarry Smith   PetscScalar       *z,x1;
676d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
677d9ca1df4SBarry Smith   const MatScalar   *v;
6786849ba73SBarry Smith   PetscErrorCode    ierr;
679d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
680d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
68198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
68249b5e25fSSatish Balay 
68349b5e25fSSatish Balay   PetscFunctionBegin;
684343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
685d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6861ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
68749b5e25fSSatish Balay   v    = a->a;
68849b5e25fSSatish Balay   xb   = x;
68949b5e25fSSatish Balay 
69049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
69149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
69249b5e25fSSatish Balay     x1          = xb[0];
69349b5e25fSSatish Balay     ib          = aj + *ai;
694831a3094SHong Zhang     jmin        = 0;
69598c9bda7SSatish Balay     nonzerorow += (n>0);
696831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
697831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
698831a3094SHong Zhang     }
699831a3094SHong Zhang     for (j=jmin; j<n; j++) {
70049b5e25fSSatish Balay       cval    = *ib;
70149b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
70249b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
70349b5e25fSSatish Balay     }
70449b5e25fSSatish Balay     xb++; ai++;
70549b5e25fSSatish Balay   }
70649b5e25fSSatish Balay 
707d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
70949b5e25fSSatish Balay 
710dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
71149b5e25fSSatish Balay   PetscFunctionReturn(0);
71249b5e25fSSatish Balay }
71349b5e25fSSatish Balay 
7144a2ae208SSatish Balay #undef __FUNCT__
7154a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
716dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
71749b5e25fSSatish Balay {
71849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
719d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
720d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
721d9ca1df4SBarry Smith   const MatScalar   *v;
7226849ba73SBarry Smith   PetscErrorCode    ierr;
723d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
724d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72649b5e25fSSatish Balay 
72749b5e25fSSatish Balay   PetscFunctionBegin;
728343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
729d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7301ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
73149b5e25fSSatish Balay 
73249b5e25fSSatish Balay   v  = a->a;
73349b5e25fSSatish Balay   xb = x;
73449b5e25fSSatish Balay 
73549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
73749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
73849b5e25fSSatish Balay     ib          = aj + *ai;
739831a3094SHong Zhang     jmin        = 0;
74098c9bda7SSatish Balay     nonzerorow += (n>0);
7417fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
74249b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
74349b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
744831a3094SHong Zhang       v        += 4; jmin++;
7457fbae186SHong Zhang     }
746444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
747444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
748831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
75049b5e25fSSatish Balay       cval       = ib[j]*2;
75149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
75249b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
75349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
75449b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
75549b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
75649b5e25fSSatish Balay       v        += 4;
75749b5e25fSSatish Balay     }
75849b5e25fSSatish Balay     xb +=2; ai++;
75949b5e25fSSatish Balay   }
760d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
761bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76249b5e25fSSatish Balay 
763dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76449b5e25fSSatish Balay   PetscFunctionReturn(0);
76549b5e25fSSatish Balay }
76649b5e25fSSatish Balay 
7674a2ae208SSatish Balay #undef __FUNCT__
7684a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
769dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
77049b5e25fSSatish Balay {
77149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
772d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
773d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
774d9ca1df4SBarry Smith   const MatScalar   *v;
7756849ba73SBarry Smith   PetscErrorCode    ierr;
776d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
777d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
77898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
77949b5e25fSSatish Balay 
78049b5e25fSSatish Balay   PetscFunctionBegin;
781343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
782d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7831ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78449b5e25fSSatish Balay 
78549b5e25fSSatish Balay   v  = a->a;
78649b5e25fSSatish Balay   xb = x;
78749b5e25fSSatish Balay 
78849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
78949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
79049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
79149b5e25fSSatish Balay     ib          = aj + *ai;
792831a3094SHong Zhang     jmin        = 0;
79398c9bda7SSatish Balay     nonzerorow += (n>0);
7947fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
79549b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
79649b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
79749b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
798831a3094SHong Zhang       v        += 9; jmin++;
7997fbae186SHong Zhang     }
800444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
801444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
802831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80449b5e25fSSatish Balay       cval       = ib[j]*3;
80549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
80649b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
80749b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
80849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
80949b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
81049b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
81149b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
81249b5e25fSSatish Balay       v        += 9;
81349b5e25fSSatish Balay     }
81449b5e25fSSatish Balay     xb +=3; ai++;
81549b5e25fSSatish Balay   }
81649b5e25fSSatish Balay 
817d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
818bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81949b5e25fSSatish Balay 
820dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
82149b5e25fSSatish Balay   PetscFunctionReturn(0);
82249b5e25fSSatish Balay }
82349b5e25fSSatish Balay 
8244a2ae208SSatish Balay #undef __FUNCT__
8254a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
826dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
82749b5e25fSSatish Balay {
82849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
829d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
830d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
831d9ca1df4SBarry Smith   const MatScalar   *v;
8326849ba73SBarry Smith   PetscErrorCode    ierr;
833d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
834d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83649b5e25fSSatish Balay 
83749b5e25fSSatish Balay   PetscFunctionBegin;
838343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
839d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8401ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
84149b5e25fSSatish Balay 
84249b5e25fSSatish Balay   v  = a->a;
84349b5e25fSSatish Balay   xb = x;
84449b5e25fSSatish Balay 
84549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
84849b5e25fSSatish Balay     ib          = aj + *ai;
849831a3094SHong Zhang     jmin        = 0;
85098c9bda7SSatish Balay     nonzerorow += (n>0);
8517fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
85249b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
85349b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
85449b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
85549b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
856831a3094SHong Zhang       v        += 16; jmin++;
8577fbae186SHong Zhang     }
858444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860831a3094SHong Zhang     for (j=jmin; j<n; j++) {
86149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
86249b5e25fSSatish Balay       cval       = ib[j]*4;
86349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
86449b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
86549b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
86649b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
86749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86849b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
86949b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
87049b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
87149b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
87249b5e25fSSatish Balay       v        += 16;
87349b5e25fSSatish Balay     }
87449b5e25fSSatish Balay     xb +=4; ai++;
87549b5e25fSSatish Balay   }
87649b5e25fSSatish Balay 
877d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
878bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87949b5e25fSSatish Balay 
880dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
88149b5e25fSSatish Balay   PetscFunctionReturn(0);
88249b5e25fSSatish Balay }
88349b5e25fSSatish Balay 
8844a2ae208SSatish Balay #undef __FUNCT__
8854a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
886dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
88749b5e25fSSatish Balay {
88849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
889d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
890d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
891d9ca1df4SBarry Smith   const MatScalar   *v;
8926849ba73SBarry Smith   PetscErrorCode    ierr;
893d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
894d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
89598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
89649b5e25fSSatish Balay 
89749b5e25fSSatish Balay   PetscFunctionBegin;
898343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
899d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9001ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
90149b5e25fSSatish Balay 
90249b5e25fSSatish Balay   v  = a->a;
90349b5e25fSSatish Balay   xb = x;
90449b5e25fSSatish Balay 
90549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
90849b5e25fSSatish Balay     ib          = aj + *ai;
909831a3094SHong Zhang     jmin        = 0;
91098c9bda7SSatish Balay     nonzerorow += (n>0);
9117fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
91249b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
91349b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
91449b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
91549b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
91649b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
917831a3094SHong Zhang       v        += 25; jmin++;
9187fbae186SHong Zhang     }
919444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
920444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
921831a3094SHong Zhang     for (j=jmin; j<n; j++) {
92249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
92349b5e25fSSatish Balay       cval       = ib[j]*5;
92449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
92549b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
92649b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
92749b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
92849b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
92949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
93049b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
93149b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
93249b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
93349b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
93449b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
93549b5e25fSSatish Balay       v        += 25;
93649b5e25fSSatish Balay     }
93749b5e25fSSatish Balay     xb +=5; ai++;
93849b5e25fSSatish Balay   }
93949b5e25fSSatish Balay 
940d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
941bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
94249b5e25fSSatish Balay 
943dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
94449b5e25fSSatish Balay   PetscFunctionReturn(0);
94549b5e25fSSatish Balay }
9464a2ae208SSatish Balay #undef __FUNCT__
9474a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
948dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
94949b5e25fSSatish Balay {
95049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
951d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
952d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
953d9ca1df4SBarry Smith   const MatScalar   *v;
9546849ba73SBarry Smith   PetscErrorCode    ierr;
955d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
956d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
95798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
95849b5e25fSSatish Balay 
95949b5e25fSSatish Balay   PetscFunctionBegin;
960343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
961d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9621ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
96349b5e25fSSatish Balay 
96449b5e25fSSatish Balay   v  = a->a;
96549b5e25fSSatish Balay   xb = x;
96649b5e25fSSatish Balay 
96749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
96849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
97049b5e25fSSatish Balay     ib          = aj + *ai;
971831a3094SHong Zhang     jmin        = 0;
97298c9bda7SSatish Balay     nonzerorow += (n>0);
9737fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
97449b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
97549b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
97649b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
97749b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
97849b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
97949b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
980831a3094SHong Zhang       v        += 36; jmin++;
9817fbae186SHong Zhang     }
982444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
983444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
984831a3094SHong Zhang     for (j=jmin; j<n; j++) {
98549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
98649b5e25fSSatish Balay       cval       = ib[j]*6;
98749b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
98849b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
98949b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
99049b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
99149b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
99249b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
99349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
99449b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
99549b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
99649b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
99749b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
99849b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
99949b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
100049b5e25fSSatish Balay       v        += 36;
100149b5e25fSSatish Balay     }
100249b5e25fSSatish Balay     xb +=6; ai++;
100349b5e25fSSatish Balay   }
100449b5e25fSSatish Balay 
1005d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1006bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
100749b5e25fSSatish Balay 
1008dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
100949b5e25fSSatish Balay   PetscFunctionReturn(0);
101049b5e25fSSatish Balay }
101149b5e25fSSatish Balay 
10124a2ae208SSatish Balay #undef __FUNCT__
10134a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1014dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
101549b5e25fSSatish Balay {
101649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1017d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1018d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1019d9ca1df4SBarry Smith   const MatScalar   *v;
10206849ba73SBarry Smith   PetscErrorCode    ierr;
1021d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1022d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
102398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
102449b5e25fSSatish Balay 
102549b5e25fSSatish Balay   PetscFunctionBegin;
1026343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1027d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10281ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
102949b5e25fSSatish Balay 
103049b5e25fSSatish Balay   v  = a->a;
103149b5e25fSSatish Balay   xb = x;
103249b5e25fSSatish Balay 
103349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
103449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
103549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
103649b5e25fSSatish Balay     ib          = aj + *ai;
1037831a3094SHong Zhang     jmin        = 0;
103898c9bda7SSatish Balay     nonzerorow += (n>0);
10397fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
104049b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
104149b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
104249b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
104349b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
104449b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
104549b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
104649b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1047831a3094SHong Zhang       v        += 49; jmin++;
10487fbae186SHong Zhang     }
1049444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1050444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1051831a3094SHong Zhang     for (j=jmin; j<n; j++) {
105249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
105349b5e25fSSatish Balay       cval       = ib[j]*7;
105449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
105549b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
105649b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
105749b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
105849b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
105949b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
106049b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
106149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
106249b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
106349b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
106449b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
106549b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
106649b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
106749b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
106849b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
106949b5e25fSSatish Balay       v       += 49;
107049b5e25fSSatish Balay     }
107149b5e25fSSatish Balay     xb +=7; ai++;
107249b5e25fSSatish Balay   }
107349b5e25fSSatish Balay 
1074d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1075bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
107649b5e25fSSatish Balay 
1077dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
107849b5e25fSSatish Balay   PetscFunctionReturn(0);
107949b5e25fSSatish Balay }
108049b5e25fSSatish Balay 
10814a2ae208SSatish Balay #undef __FUNCT__
10824a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1083dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
108449b5e25fSSatish Balay {
108549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1086d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1087d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1088d9ca1df4SBarry Smith   const MatScalar   *v;
1089dfbe8321SBarry Smith   PetscErrorCode    ierr;
1090d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1091d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
109298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
109349b5e25fSSatish Balay 
109449b5e25fSSatish Balay   PetscFunctionBegin;
1095343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1096d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
10971ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
109849b5e25fSSatish Balay 
109949b5e25fSSatish Balay   aj = a->j;
110049b5e25fSSatish Balay   v  = a->a;
110149b5e25fSSatish Balay   ii = a->i;
110249b5e25fSSatish Balay 
110349b5e25fSSatish Balay   if (!a->mult_work) {
1104854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
110549b5e25fSSatish Balay   }
110649b5e25fSSatish Balay   work = a->mult_work;
110749b5e25fSSatish Balay 
110849b5e25fSSatish Balay 
110949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
111049b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
111149b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
111298c9bda7SSatish Balay     nonzerorow += (n>0);
111349b5e25fSSatish Balay 
111449b5e25fSSatish Balay     /* upper triangular part */
111549b5e25fSSatish Balay     for (j=0; j<n; j++) {
111649b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
111749b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
111849b5e25fSSatish Balay       workt += bs;
111949b5e25fSSatish Balay     }
112049b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
112196b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
112249b5e25fSSatish Balay 
112349b5e25fSSatish Balay     /* strict lower triangular part */
1124831a3094SHong Zhang     idx = aj+ii[0];
1125831a3094SHong Zhang     if (*idx == i) {
112696b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1127831a3094SHong Zhang     }
112849b5e25fSSatish Balay     if (ncols > 0) {
112949b5e25fSSatish Balay       workt = work;
113087828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
113196b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1132831a3094SHong Zhang       for (j=0; j<n; j++) {
1133831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
113449b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
113549b5e25fSSatish Balay         workt += bs;
113649b5e25fSSatish Balay       }
113749b5e25fSSatish Balay     }
113849b5e25fSSatish Balay 
113949b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
114049b5e25fSSatish Balay   }
114149b5e25fSSatish Balay 
1142d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1143bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
114449b5e25fSSatish Balay 
1145dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
114649b5e25fSSatish Balay   PetscFunctionReturn(0);
114749b5e25fSSatish Balay }
114849b5e25fSSatish Balay 
11494a2ae208SSatish Balay #undef __FUNCT__
11504a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1151f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
115249b5e25fSSatish Balay {
115349b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1154f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1155efee365bSSatish Balay   PetscErrorCode ierr;
1156c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
115749b5e25fSSatish Balay 
115849b5e25fSSatish Balay   PetscFunctionBegin;
1159c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11608b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1161efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
116249b5e25fSSatish Balay   PetscFunctionReturn(0);
116349b5e25fSSatish Balay }
116449b5e25fSSatish Balay 
11654a2ae208SSatish Balay #undef __FUNCT__
11664a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1167dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
116849b5e25fSSatish Balay {
116949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1170d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
117149b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1172d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11736849ba73SBarry Smith   PetscErrorCode  ierr;
1174d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
117549b5e25fSSatish Balay 
117649b5e25fSSatish Balay   PetscFunctionBegin;
117749b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
117849b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
117949b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1180831a3094SHong Zhang       col  = aj + jmin;
1181831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
118249b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118349b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
118449b5e25fSSatish Balay         }
1185831a3094SHong Zhang         jmin++;
1186831a3094SHong Zhang       }
1187831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
118849b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118949b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
119049b5e25fSSatish Balay         }
119149b5e25fSSatish Balay       }
119249b5e25fSSatish Balay     }
11938f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
11940b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1195dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
11960b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
11970b8dc8d2SHong Zhang     il[0] = 0;
119849b5e25fSSatish Balay 
119949b5e25fSSatish Balay     *norm = 0.0;
120049b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
120149b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
120249b5e25fSSatish Balay       /*-- col sum --*/
120349b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1204831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1205831a3094SHong Zhang                   at step k */
120649b5e25fSSatish Balay       while (i<mbs) {
120749b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
120849b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
120949b5e25fSSatish Balay         for (j=0; j<bs; j++) {
121049b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
121149b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
121249b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
121349b5e25fSSatish Balay           }
121449b5e25fSSatish Balay         }
121549b5e25fSSatish Balay         /* update il, jl */
1216831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1217831a3094SHong Zhang         jmax = a->i[i+1];
121849b5e25fSSatish Balay         if (jmin < jmax) {
121949b5e25fSSatish Balay           il[i] = jmin;
122049b5e25fSSatish Balay           j     = a->j[jmin];
122149b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
122249b5e25fSSatish Balay         }
122349b5e25fSSatish Balay         i = nexti;
122449b5e25fSSatish Balay       }
122549b5e25fSSatish Balay       /*-- row sum --*/
122649b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
122749b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
122849b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122949b5e25fSSatish Balay           v = a->a + i*bs2 + j;
123049b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12310b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
123249b5e25fSSatish Balay           }
123349b5e25fSSatish Balay         }
123449b5e25fSSatish Balay       }
123549b5e25fSSatish Balay       /* add k_th block row to il, jl */
1236831a3094SHong Zhang       col = aj+jmin;
1237831a3094SHong Zhang       if (*col == k) jmin++;
123849b5e25fSSatish Balay       if (jmin < jmax) {
123949b5e25fSSatish Balay         il[k] = jmin;
12400b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
124149b5e25fSSatish Balay       }
124249b5e25fSSatish Balay       for (j=0; j<bs; j++) {
124349b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
124449b5e25fSSatish Balay       }
124549b5e25fSSatish Balay     }
124674ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1247f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
124849b5e25fSSatish Balay   PetscFunctionReturn(0);
124949b5e25fSSatish Balay }
125049b5e25fSSatish Balay 
12514a2ae208SSatish Balay #undef __FUNCT__
12524a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1253ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
125449b5e25fSSatish Balay {
125549b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1256dfbe8321SBarry Smith   PetscErrorCode ierr;
125749b5e25fSSatish Balay 
125849b5e25fSSatish Balay   PetscFunctionBegin;
125949b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1260d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1261ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1262ef511fbeSHong Zhang     PetscFunctionReturn(0);
126349b5e25fSSatish Balay   }
126449b5e25fSSatish Balay 
126549b5e25fSSatish Balay   /* if the a->i are the same */
126613f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126726fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
126849b5e25fSSatish Balay 
126949b5e25fSSatish Balay   /* if a->j are the same */
127013f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
127126fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
127226fbe8dcSKarl Rupp 
127349b5e25fSSatish Balay   /* if a->a are the same */
1274d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1275935af2e7SHong Zhang   PetscFunctionReturn(0);
127649b5e25fSSatish Balay }
127749b5e25fSSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12794a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1280dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
128149b5e25fSSatish Balay {
128249b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1283dfbe8321SBarry Smith   PetscErrorCode  ierr;
1284d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1285d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
128687828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1287d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
128849b5e25fSSatish Balay 
128949b5e25fSSatish Balay   PetscFunctionBegin;
1290d0f46423SBarry Smith   bs = A->rmap->bs;
1291e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
129282799104SHong Zhang 
129349b5e25fSSatish Balay   aa   = a->a;
12948a0c6efdSHong Zhang   ambs = a->mbs;
12958a0c6efdSHong Zhang 
12968a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
12978a0c6efdSHong Zhang     PetscInt *diag=a->diag;
12988a0c6efdSHong Zhang     aa   = a->a;
12998a0c6efdSHong Zhang     ambs = a->mbs;
13008a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13018a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13028a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13038a0c6efdSHong Zhang     PetscFunctionReturn(0);
13048a0c6efdSHong Zhang   }
13058a0c6efdSHong Zhang 
130649b5e25fSSatish Balay   ai   = a->i;
130749b5e25fSSatish Balay   aj   = a->j;
130849b5e25fSSatish Balay   bs2  = a->bs2;
13092dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13101ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
131149b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
131249b5e25fSSatish Balay     j=ai[i];
131349b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
131449b5e25fSSatish Balay       row  = i*bs;
131549b5e25fSSatish Balay       aa_j = aa + j*bs2;
131649b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
131749b5e25fSSatish Balay     }
131849b5e25fSSatish Balay   }
13191ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
132049b5e25fSSatish Balay   PetscFunctionReturn(0);
132149b5e25fSSatish Balay }
132249b5e25fSSatish Balay 
13234a2ae208SSatish Balay #undef __FUNCT__
13244a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1325dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
132649b5e25fSSatish Balay {
132749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1328d9ca1df4SBarry Smith   PetscScalar       x;
1329d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
133049b5e25fSSatish Balay   MatScalar         *aa,*v;
1331dfbe8321SBarry Smith   PetscErrorCode    ierr;
13325e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1333ace3abfcSBarry Smith   PetscBool         flg;
133449b5e25fSSatish Balay 
133549b5e25fSSatish Balay   PetscFunctionBegin;
1336b3bf805bSHong Zhang   if (ll != rr) {
1337b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1338e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1339b3bf805bSHong Zhang   }
1340b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
134149b5e25fSSatish Balay   ai  = a->i;
134249b5e25fSSatish Balay   aj  = a->j;
134349b5e25fSSatish Balay   aa  = a->a;
1344d0f46423SBarry Smith   m   = A->rmap->N;
1345d0f46423SBarry Smith   bs  = A->rmap->bs;
134649b5e25fSSatish Balay   mbs = a->mbs;
134749b5e25fSSatish Balay   bs2 = a->bs2;
134849b5e25fSSatish Balay 
1349d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
135049b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1351e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
135249b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
135349b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
135449b5e25fSSatish Balay     li = l + i*bs;
135549b5e25fSSatish Balay     v  = aa + bs2*ai[i];
135649b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
135749b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13585e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
135949b5e25fSSatish Balay         x = ri[k];
136049b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
136149b5e25fSSatish Balay       }
136249b5e25fSSatish Balay     }
136349b5e25fSSatish Balay   }
1364d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1365dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
136649b5e25fSSatish Balay   PetscFunctionReturn(0);
136749b5e25fSSatish Balay }
136849b5e25fSSatish Balay 
13694a2ae208SSatish Balay #undef __FUNCT__
13704a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1371dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
137249b5e25fSSatish Balay {
137349b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
137449b5e25fSSatish Balay 
137549b5e25fSSatish Balay   PetscFunctionBegin;
137649b5e25fSSatish Balay   info->block_size   = a->bs2;
1377ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13786c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
137949b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
138049b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13818e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13827adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1383d5f3da31SBarry Smith   if (A->factortype) {
138449b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
138549b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
138649b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
138749b5e25fSSatish Balay   } else {
138849b5e25fSSatish Balay     info->fill_ratio_given  = 0;
138949b5e25fSSatish Balay     info->fill_ratio_needed = 0;
139049b5e25fSSatish Balay     info->factor_mallocs    = 0;
139149b5e25fSSatish Balay   }
139249b5e25fSSatish Balay   PetscFunctionReturn(0);
139349b5e25fSSatish Balay }
139449b5e25fSSatish Balay 
139549b5e25fSSatish Balay 
13964a2ae208SSatish Balay #undef __FUNCT__
13974a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1398dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
139949b5e25fSSatish Balay {
140049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1401dfbe8321SBarry Smith   PetscErrorCode ierr;
140249b5e25fSSatish Balay 
140349b5e25fSSatish Balay   PetscFunctionBegin;
140449b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
140549b5e25fSSatish Balay   PetscFunctionReturn(0);
140649b5e25fSSatish Balay }
1407dc354874SHong Zhang 
14084a2ae208SSatish Balay #undef __FUNCT__
1409985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1410985db425SBarry Smith /*
1411985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1412985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1413985db425SBarry Smith */
1414985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1415dc354874SHong Zhang {
1416dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1417dfbe8321SBarry Smith   PetscErrorCode  ierr;
1418d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1419d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1420c3fca9a7SHong Zhang   PetscReal       atmp;
1421d9ca1df4SBarry Smith   const MatScalar *aa;
1422985db425SBarry Smith   PetscScalar     *x;
142313f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1424dc354874SHong Zhang 
1425dc354874SHong Zhang   PetscFunctionBegin;
1426e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1427e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1428d0f46423SBarry Smith   bs  = A->rmap->bs;
1429dc354874SHong Zhang   aa  = a->a;
1430dc354874SHong Zhang   ai  = a->i;
1431dc354874SHong Zhang   aj  = a->j;
143244117c81SHong Zhang   mbs = a->mbs;
1433dc354874SHong Zhang 
1434985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14351ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1436dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1437e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
143844117c81SHong Zhang   for (i=0; i<mbs; i++) {
1439d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1440d0f6400bSHong Zhang     brow  = bs*i;
144144117c81SHong Zhang     for (j=0; j<ncols; j++) {
1442d0f6400bSHong Zhang       bcol = bs*(*aj);
144344117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1444d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
144544117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1446d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1447d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1448c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1449c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
145044117c81SHong Zhang         }
145144117c81SHong Zhang       }
1452d0f6400bSHong Zhang       aj++;
1453dc354874SHong Zhang     }
1454dc354874SHong Zhang   }
14551ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1456dc354874SHong Zhang   PetscFunctionReturn(0);
1457dc354874SHong Zhang }
1458