xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 55d14e7db00c55374b7a73614b4a2091d5afcedc)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
954a2ae208SSatish Balay #undef __FUNCT__
964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97*55d14e7dSHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
9849b5e25fSSatish Balay {
9949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
1006849ba73SBarry Smith   PetscErrorCode  ierr;
10113f74950SBarry Smith   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
10213f74950SBarry Smith   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
1035d0c19d7SBarry Smith   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
1045d0c19d7SBarry Smith   const PetscInt  *irow,*aj = a->j,*ai = a->i;
10549b5e25fSSatish Balay   MatScalar       *mat_a;
10649b5e25fSSatish Balay   Mat             C;
107ace3abfcSBarry Smith   PetscBool       flag,sorted;
10849b5e25fSSatish Balay 
10949b5e25fSSatish Balay   PetscFunctionBegin;
110*55d14e7dSHong Zhang   ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr);
111*55d14e7dSHong Zhang   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
11249b5e25fSSatish Balay 
11349b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
11449b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
11549b5e25fSSatish Balay 
116785e854fSJed Brown   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
11774ed9c26SBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
11849b5e25fSSatish Balay   ssmap = smap;
119854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
12049b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
12149b5e25fSSatish Balay   /* determine lens of each row */
12249b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
12349b5e25fSSatish Balay     kstart  = ai[irow[i]];
12449b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
12549b5e25fSSatish Balay     lens[i] = 0;
12649b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
12726fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
12849b5e25fSSatish Balay     }
12949b5e25fSSatish Balay   }
13049b5e25fSSatish Balay   /* Create and fill new matrix */
13149b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
13249b5e25fSSatish Balay     c = (Mat_SeqSBAIJ*)((*B)->data);
13349b5e25fSSatish Balay 
134e32f2f54SBarry Smith     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
13513f74950SBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
13713f74950SBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
13849b5e25fSSatish Balay     C    = *B;
13949b5e25fSSatish Balay   } else {
140ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
141f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1427adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
143ab93d7beSBarry Smith     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
14449b5e25fSSatish Balay   }
14549b5e25fSSatish Balay   c = (Mat_SeqSBAIJ*)(C->data);
14649b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
14749b5e25fSSatish Balay     row      = irow[i];
14849b5e25fSSatish Balay     kstart   = ai[row];
14949b5e25fSSatish Balay     kend     = kstart + a->ilen[row];
15049b5e25fSSatish Balay     mat_i    = c->i[i];
15149b5e25fSSatish Balay     mat_j    = c->j + mat_i;
15249b5e25fSSatish Balay     mat_a    = c->a + mat_i*bs2;
15349b5e25fSSatish Balay     mat_ilen = c->ilen + i;
15449b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
15549b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
15649b5e25fSSatish Balay         *mat_j++ = tcol - 1;
15749b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
15849b5e25fSSatish Balay         mat_a   += bs2;
15949b5e25fSSatish Balay         (*mat_ilen)++;
16049b5e25fSSatish Balay       }
16149b5e25fSSatish Balay     }
16249b5e25fSSatish Balay   }
16349b5e25fSSatish Balay 
16449b5e25fSSatish Balay   /* Free work space */
16549b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
16649b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
16749b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16849b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16949b5e25fSSatish Balay 
17049b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
17149b5e25fSSatish Balay   *B   = C;
17249b5e25fSSatish Balay   PetscFunctionReturn(0);
17349b5e25fSSatish Balay }
17449b5e25fSSatish Balay 
1754a2ae208SSatish Balay #undef __FUNCT__
1764a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
1774aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
17849b5e25fSSatish Balay {
17949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
18049b5e25fSSatish Balay   IS             is1;
1816849ba73SBarry Smith   PetscErrorCode ierr;
1825d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
1835d0c19d7SBarry Smith   const PetscInt *irow;
18449b5e25fSSatish Balay 
18549b5e25fSSatish Balay   PetscFunctionBegin;
1868f46ffcaSHong Zhang   if (isrow != iscol) {
1878f46ffcaSHong Zhang     PetscBool isequal;
1888f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
1898f46ffcaSHong Zhang     if (!isequal)
1908f46ffcaSHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
1918f46ffcaSHong Zhang   }
19249b5e25fSSatish Balay 
193*55d14e7dSHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
19449b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
19549b5e25fSSatish Balay 
19649b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
197*55d14e7dSHong Zhang    and form the IS with compressed IS */
198dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
19974ed9c26SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
20049b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
20149b5e25fSSatish Balay 
20249b5e25fSSatish Balay   count = 0;
20349b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
204e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
20549b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
20649b5e25fSSatish Balay   }
20749b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
20870b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
20974ed9c26SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
21049b5e25fSSatish Balay 
211*55d14e7dSHong Zhang   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr);
2126bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
21349b5e25fSSatish Balay   PetscFunctionReturn(0);
21449b5e25fSSatish Balay }
21549b5e25fSSatish Balay 
2164a2ae208SSatish Balay #undef __FUNCT__
2174a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
21813f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
21949b5e25fSSatish Balay {
2206849ba73SBarry Smith   PetscErrorCode ierr;
22113f74950SBarry Smith   PetscInt       i;
22249b5e25fSSatish Balay 
22349b5e25fSSatish Balay   PetscFunctionBegin;
22449b5e25fSSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
225854ce69bSBarry Smith     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
22649b5e25fSSatish Balay   }
22749b5e25fSSatish Balay 
22849b5e25fSSatish Balay   for (i=0; i<n; i++) {
2294aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
23049b5e25fSSatish Balay   }
23149b5e25fSSatish Balay   PetscFunctionReturn(0);
23249b5e25fSSatish Balay }
23349b5e25fSSatish Balay 
23449b5e25fSSatish Balay /* -------------------------------------------------------*/
23549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
23649b5e25fSSatish Balay /* -------------------------------------------------------*/
23749b5e25fSSatish Balay 
2384a2ae208SSatish Balay #undef __FUNCT__
2394a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
240dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
24149b5e25fSSatish Balay {
24249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
243d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
244d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
245d9ca1df4SBarry Smith   const MatScalar   *v;
2466849ba73SBarry Smith   PetscErrorCode    ierr;
247d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
248d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
24998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
25049b5e25fSSatish Balay 
25149b5e25fSSatish Balay   PetscFunctionBegin;
2522dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
253d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2541ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
25549b5e25fSSatish Balay 
25649b5e25fSSatish Balay   v  = a->a;
25749b5e25fSSatish Balay   xb = x;
25849b5e25fSSatish Balay 
25949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
26049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
26149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
26249b5e25fSSatish Balay     ib          = aj + *ai;
263831a3094SHong Zhang     jmin        = 0;
26498c9bda7SSatish Balay     nonzerorow += (n>0);
2657fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
26649b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
26749b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
268831a3094SHong Zhang       v        += 4; jmin++;
2697fbae186SHong Zhang     }
270444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
271444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
272831a3094SHong Zhang     for (j=jmin; j<n; j++) {
27349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
27449b5e25fSSatish Balay       cval       = ib[j]*2;
27549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
27649b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
27749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
27849b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
27949b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
28049b5e25fSSatish Balay       v        += 4;
28149b5e25fSSatish Balay     }
28249b5e25fSSatish Balay     xb +=2; ai++;
28349b5e25fSSatish Balay   }
28449b5e25fSSatish Balay 
285d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2861ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
287dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
28849b5e25fSSatish Balay   PetscFunctionReturn(0);
28949b5e25fSSatish Balay }
29049b5e25fSSatish Balay 
2914a2ae208SSatish Balay #undef __FUNCT__
2924a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
293dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
29449b5e25fSSatish Balay {
29549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
296d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
297d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
298d9ca1df4SBarry Smith   const MatScalar   *v;
2996849ba73SBarry Smith   PetscErrorCode    ierr;
300d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
301d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
30298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
30349b5e25fSSatish Balay 
30449b5e25fSSatish Balay   PetscFunctionBegin;
3052dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
306d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30849b5e25fSSatish Balay 
30949b5e25fSSatish Balay   v  = a->a;
31049b5e25fSSatish Balay   xb = x;
31149b5e25fSSatish Balay 
31249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
31349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
31549b5e25fSSatish Balay     ib          = aj + *ai;
316831a3094SHong Zhang     jmin        = 0;
31798c9bda7SSatish Balay     nonzerorow += (n>0);
3187fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31949b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
32049b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
32149b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
322831a3094SHong Zhang       v        += 9; jmin++;
3237fbae186SHong Zhang     }
324444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
325444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
326831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32849b5e25fSSatish Balay       cval       = ib[j]*3;
32949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
33049b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
33149b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
33249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
33349b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
33449b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
33549b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
33649b5e25fSSatish Balay       v        += 9;
33749b5e25fSSatish Balay     }
33849b5e25fSSatish Balay     xb +=3; ai++;
33949b5e25fSSatish Balay   }
34049b5e25fSSatish Balay 
341d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3421ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
343dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
34449b5e25fSSatish Balay   PetscFunctionReturn(0);
34549b5e25fSSatish Balay }
34649b5e25fSSatish Balay 
3474a2ae208SSatish Balay #undef __FUNCT__
3484a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
349dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
35049b5e25fSSatish Balay {
35149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
352d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
353d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
354d9ca1df4SBarry Smith   const MatScalar   *v;
3556849ba73SBarry Smith   PetscErrorCode    ierr;
356d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
357d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
35898c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
35949b5e25fSSatish Balay 
36049b5e25fSSatish Balay   PetscFunctionBegin;
3612dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
362d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3631ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
36449b5e25fSSatish Balay 
36549b5e25fSSatish Balay   v  = a->a;
36649b5e25fSSatish Balay   xb = x;
36749b5e25fSSatish Balay 
36849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
37049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
37149b5e25fSSatish Balay     ib          = aj + *ai;
372831a3094SHong Zhang     jmin        = 0;
37398c9bda7SSatish Balay     nonzerorow += (n>0);
3747fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
37549b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
37649b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
37749b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
37849b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
379831a3094SHong Zhang       v        += 16; jmin++;
3807fbae186SHong Zhang     }
381444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
382444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383831a3094SHong Zhang     for (j=jmin; j<n; j++) {
38449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38549b5e25fSSatish Balay       cval       = ib[j]*4;
38649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
38749b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
38849b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
38949b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
39049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
39149b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
39249b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
39349b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
39449b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
39549b5e25fSSatish Balay       v        += 16;
39649b5e25fSSatish Balay     }
39749b5e25fSSatish Balay     xb +=4; ai++;
39849b5e25fSSatish Balay   }
39949b5e25fSSatish Balay 
400d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
402dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
40349b5e25fSSatish Balay   PetscFunctionReturn(0);
40449b5e25fSSatish Balay }
40549b5e25fSSatish Balay 
4064a2ae208SSatish Balay #undef __FUNCT__
4074a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
408dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
40949b5e25fSSatish Balay {
41049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
412d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
413d9ca1df4SBarry Smith   const MatScalar   *v;
4146849ba73SBarry Smith   PetscErrorCode    ierr;
415d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
41798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
41849b5e25fSSatish Balay 
41949b5e25fSSatish Balay   PetscFunctionBegin;
4202dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
421d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4221ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
42349b5e25fSSatish Balay 
42449b5e25fSSatish Balay   v  = a->a;
42549b5e25fSSatish Balay   xb = x;
42649b5e25fSSatish Balay 
42749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
42849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
42949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
43049b5e25fSSatish Balay     ib          = aj + *ai;
431831a3094SHong Zhang     jmin        = 0;
43298c9bda7SSatish Balay     nonzerorow += (n>0);
4337fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
43449b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
43549b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
43649b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
43749b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
43849b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
439831a3094SHong Zhang       v        += 25; jmin++;
4407fbae186SHong Zhang     }
441444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
442444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
443831a3094SHong Zhang     for (j=jmin; j<n; j++) {
44449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44549b5e25fSSatish Balay       cval       = ib[j]*5;
44649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
44749b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
44849b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
44949b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
45049b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
45149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
45249b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
45349b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
45449b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
45549b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
45649b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
45749b5e25fSSatish Balay       v        += 25;
45849b5e25fSSatish Balay     }
45949b5e25fSSatish Balay     xb +=5; ai++;
46049b5e25fSSatish Balay   }
46149b5e25fSSatish Balay 
462d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4631ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
464dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
46549b5e25fSSatish Balay   PetscFunctionReturn(0);
46649b5e25fSSatish Balay }
46749b5e25fSSatish Balay 
46849b5e25fSSatish Balay 
4694a2ae208SSatish Balay #undef __FUNCT__
4704a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
471dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
47249b5e25fSSatish Balay {
47349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
474d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
475d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
476d9ca1df4SBarry Smith   const MatScalar   *v;
4776849ba73SBarry Smith   PetscErrorCode    ierr;
478d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
479d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
48098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
48149b5e25fSSatish Balay 
48249b5e25fSSatish Balay   PetscFunctionBegin;
4832dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
484d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4851ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
48649b5e25fSSatish Balay 
48749b5e25fSSatish Balay   v  = a->a;
48849b5e25fSSatish Balay   xb = x;
48949b5e25fSSatish Balay 
49049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
49149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
49249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
49349b5e25fSSatish Balay     ib          = aj + *ai;
494831a3094SHong Zhang     jmin        = 0;
49598c9bda7SSatish Balay     nonzerorow += (n>0);
4967fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
49749b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
49849b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
49949b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
50049b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
50149b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
50249b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
503831a3094SHong Zhang       v        += 36; jmin++;
5047fbae186SHong Zhang     }
505444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
506444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
507831a3094SHong Zhang     for (j=jmin; j<n; j++) {
50849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50949b5e25fSSatish Balay       cval       = ib[j]*6;
51049b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
51149b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
51249b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
51349b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
51449b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
51549b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
51649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51749b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
51849b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
51949b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
52049b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
52149b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
52249b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
52349b5e25fSSatish Balay       v        += 36;
52449b5e25fSSatish Balay     }
52549b5e25fSSatish Balay     xb +=6; ai++;
52649b5e25fSSatish Balay   }
52749b5e25fSSatish Balay 
528d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5291ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
530dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
53149b5e25fSSatish Balay   PetscFunctionReturn(0);
53249b5e25fSSatish Balay }
5334a2ae208SSatish Balay #undef __FUNCT__
5344a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
535dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
53649b5e25fSSatish Balay {
53749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
538d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
539d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
540d9ca1df4SBarry Smith   const MatScalar   *v;
5416849ba73SBarry Smith   PetscErrorCode    ierr;
542d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
543d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
54498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
54549b5e25fSSatish Balay 
54649b5e25fSSatish Balay   PetscFunctionBegin;
5472dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
548d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5491ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
55049b5e25fSSatish Balay 
55149b5e25fSSatish Balay   v  = a->a;
55249b5e25fSSatish Balay   xb = x;
55349b5e25fSSatish Balay 
55449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
55549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
55649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
55749b5e25fSSatish Balay     ib          = aj + *ai;
558831a3094SHong Zhang     jmin        = 0;
55998c9bda7SSatish Balay     nonzerorow += (n>0);
5607fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
56149b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
56249b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
56349b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
56449b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
56549b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
56649b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
56749b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
568831a3094SHong Zhang       v        += 49; jmin++;
5697fbae186SHong Zhang     }
570444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
571444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
572831a3094SHong Zhang     for (j=jmin; j<n; j++) {
57349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57449b5e25fSSatish Balay       cval       = ib[j]*7;
57549b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
57649b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
57749b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
57849b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
57949b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
58049b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
58149b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
58249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
58349b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
58449b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
58549b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
58649b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
58749b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
58849b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
58949b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
59049b5e25fSSatish Balay       v       += 49;
59149b5e25fSSatish Balay     }
59249b5e25fSSatish Balay     xb +=7; ai++;
59349b5e25fSSatish Balay   }
594d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5951ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
596dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
59749b5e25fSSatish Balay   PetscFunctionReturn(0);
59849b5e25fSSatish Balay }
59949b5e25fSSatish Balay 
60049b5e25fSSatish Balay /*
60149b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
60249b5e25fSSatish Balay */
6034a2ae208SSatish Balay #undef __FUNCT__
6044a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
605dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
60649b5e25fSSatish Balay {
60749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
608d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
609d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
610d9ca1df4SBarry Smith   const MatScalar   *v;
611dfbe8321SBarry Smith   PetscErrorCode    ierr;
612d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
613d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
61498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
61549b5e25fSSatish Balay 
61649b5e25fSSatish Balay   PetscFunctionBegin;
6172dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
618d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6191ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
62049b5e25fSSatish Balay 
62149b5e25fSSatish Balay   aj = a->j;
62249b5e25fSSatish Balay   v  = a->a;
62349b5e25fSSatish Balay   ii = a->i;
62449b5e25fSSatish Balay 
62549b5e25fSSatish Balay   if (!a->mult_work) {
626854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
62749b5e25fSSatish Balay   }
62849b5e25fSSatish Balay   work = a->mult_work;
62949b5e25fSSatish Balay 
63049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
63149b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
63249b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
63398c9bda7SSatish Balay     nonzerorow += (n>0);
63449b5e25fSSatish Balay 
63549b5e25fSSatish Balay     /* upper triangular part */
63649b5e25fSSatish Balay     for (j=0; j<n; j++) {
63749b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
63849b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
63949b5e25fSSatish Balay       workt += bs;
64049b5e25fSSatish Balay     }
64149b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
64296b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
64349b5e25fSSatish Balay 
64449b5e25fSSatish Balay     /* strict lower triangular part */
645831a3094SHong Zhang     idx = aj+ii[0];
646831a3094SHong Zhang     if (*idx == i) {
64796b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
648831a3094SHong Zhang     }
64996b9376eSHong Zhang 
65049b5e25fSSatish Balay     if (ncols > 0) {
65149b5e25fSSatish Balay       workt = work;
65287828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
65396b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
654831a3094SHong Zhang       for (j=0; j<n; j++) {
655831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
65649b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
65749b5e25fSSatish Balay         workt += bs;
65849b5e25fSSatish Balay       }
65949b5e25fSSatish Balay     }
66049b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
66149b5e25fSSatish Balay   }
66249b5e25fSSatish Balay 
663d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6641ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
665dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
66649b5e25fSSatish Balay   PetscFunctionReturn(0);
66749b5e25fSSatish Balay }
66849b5e25fSSatish Balay 
6694a2ae208SSatish Balay #undef __FUNCT__
6704a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
671dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
67249b5e25fSSatish Balay {
67349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
674d9ca1df4SBarry Smith   PetscScalar       *z,x1;
675d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
676d9ca1df4SBarry Smith   const MatScalar   *v;
6776849ba73SBarry Smith   PetscErrorCode    ierr;
678d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
679d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
68098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
68149b5e25fSSatish Balay 
68249b5e25fSSatish Balay   PetscFunctionBegin;
683343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
684d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6851ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
68649b5e25fSSatish Balay   v    = a->a;
68749b5e25fSSatish Balay   xb   = x;
68849b5e25fSSatish Balay 
68949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
69049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
69149b5e25fSSatish Balay     x1          = xb[0];
69249b5e25fSSatish Balay     ib          = aj + *ai;
693831a3094SHong Zhang     jmin        = 0;
69498c9bda7SSatish Balay     nonzerorow += (n>0);
695831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
696831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
697831a3094SHong Zhang     }
698831a3094SHong Zhang     for (j=jmin; j<n; j++) {
69949b5e25fSSatish Balay       cval    = *ib;
70049b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
70149b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
70249b5e25fSSatish Balay     }
70349b5e25fSSatish Balay     xb++; ai++;
70449b5e25fSSatish Balay   }
70549b5e25fSSatish Balay 
706d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
707bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
70849b5e25fSSatish Balay 
709dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
71049b5e25fSSatish Balay   PetscFunctionReturn(0);
71149b5e25fSSatish Balay }
71249b5e25fSSatish Balay 
7134a2ae208SSatish Balay #undef __FUNCT__
7144a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
715dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
71649b5e25fSSatish Balay {
71749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
719d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
720d9ca1df4SBarry Smith   const MatScalar   *v;
7216849ba73SBarry Smith   PetscErrorCode    ierr;
722d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72549b5e25fSSatish Balay 
72649b5e25fSSatish Balay   PetscFunctionBegin;
727343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
728d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7291ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
73049b5e25fSSatish Balay 
73149b5e25fSSatish Balay   v  = a->a;
73249b5e25fSSatish Balay   xb = x;
73349b5e25fSSatish Balay 
73449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
73649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
73749b5e25fSSatish Balay     ib          = aj + *ai;
738831a3094SHong Zhang     jmin        = 0;
73998c9bda7SSatish Balay     nonzerorow += (n>0);
7407fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
74149b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
74249b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
743831a3094SHong Zhang       v        += 4; jmin++;
7447fbae186SHong Zhang     }
745444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
746444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
747831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
74949b5e25fSSatish Balay       cval       = ib[j]*2;
75049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
75149b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
75249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
75349b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
75449b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
75549b5e25fSSatish Balay       v        += 4;
75649b5e25fSSatish Balay     }
75749b5e25fSSatish Balay     xb +=2; ai++;
75849b5e25fSSatish Balay   }
759d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
760bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76149b5e25fSSatish Balay 
762dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76349b5e25fSSatish Balay   PetscFunctionReturn(0);
76449b5e25fSSatish Balay }
76549b5e25fSSatish Balay 
7664a2ae208SSatish Balay #undef __FUNCT__
7674a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
768dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
76949b5e25fSSatish Balay {
77049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
771d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
772d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
773d9ca1df4SBarry Smith   const MatScalar   *v;
7746849ba73SBarry Smith   PetscErrorCode    ierr;
775d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
776d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
77798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
77849b5e25fSSatish Balay 
77949b5e25fSSatish Balay   PetscFunctionBegin;
780343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
781d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7821ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78349b5e25fSSatish Balay 
78449b5e25fSSatish Balay   v  = a->a;
78549b5e25fSSatish Balay   xb = x;
78649b5e25fSSatish Balay 
78749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
78849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
78949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
79049b5e25fSSatish Balay     ib          = aj + *ai;
791831a3094SHong Zhang     jmin        = 0;
79298c9bda7SSatish Balay     nonzerorow += (n>0);
7937fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
79449b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
79549b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
79649b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
797831a3094SHong Zhang       v        += 9; jmin++;
7987fbae186SHong Zhang     }
799444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
801831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80349b5e25fSSatish Balay       cval       = ib[j]*3;
80449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
80549b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
80649b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
80749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
80849b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
80949b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
81049b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
81149b5e25fSSatish Balay       v        += 9;
81249b5e25fSSatish Balay     }
81349b5e25fSSatish Balay     xb +=3; ai++;
81449b5e25fSSatish Balay   }
81549b5e25fSSatish Balay 
816d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81849b5e25fSSatish Balay 
819dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
82049b5e25fSSatish Balay   PetscFunctionReturn(0);
82149b5e25fSSatish Balay }
82249b5e25fSSatish Balay 
8234a2ae208SSatish Balay #undef __FUNCT__
8244a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
825dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
82649b5e25fSSatish Balay {
82749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
828d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
829d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
830d9ca1df4SBarry Smith   const MatScalar   *v;
8316849ba73SBarry Smith   PetscErrorCode    ierr;
832d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
833d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83549b5e25fSSatish Balay 
83649b5e25fSSatish Balay   PetscFunctionBegin;
837343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
838d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8391ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
84049b5e25fSSatish Balay 
84149b5e25fSSatish Balay   v  = a->a;
84249b5e25fSSatish Balay   xb = x;
84349b5e25fSSatish Balay 
84449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
84749b5e25fSSatish Balay     ib          = aj + *ai;
848831a3094SHong Zhang     jmin        = 0;
84998c9bda7SSatish Balay     nonzerorow += (n>0);
8507fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
85149b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
85249b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
85349b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
85449b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
855831a3094SHong Zhang       v        += 16; jmin++;
8567fbae186SHong Zhang     }
857444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
858444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
859831a3094SHong Zhang     for (j=jmin; j<n; j++) {
86049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
86149b5e25fSSatish Balay       cval       = ib[j]*4;
86249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
86349b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
86449b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
86549b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
86649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86749b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
86849b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
86949b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
87049b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
87149b5e25fSSatish Balay       v        += 16;
87249b5e25fSSatish Balay     }
87349b5e25fSSatish Balay     xb +=4; ai++;
87449b5e25fSSatish Balay   }
87549b5e25fSSatish Balay 
876d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
877bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87849b5e25fSSatish Balay 
879dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
88049b5e25fSSatish Balay   PetscFunctionReturn(0);
88149b5e25fSSatish Balay }
88249b5e25fSSatish Balay 
8834a2ae208SSatish Balay #undef __FUNCT__
8844a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
885dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
88649b5e25fSSatish Balay {
88749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
888d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
889d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
890d9ca1df4SBarry Smith   const MatScalar   *v;
8916849ba73SBarry Smith   PetscErrorCode    ierr;
892d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
893d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
89498c9bda7SSatish Balay   PetscInt          nonzerorow=0;
89549b5e25fSSatish Balay 
89649b5e25fSSatish Balay   PetscFunctionBegin;
897343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
898d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8991ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
90049b5e25fSSatish Balay 
90149b5e25fSSatish Balay   v  = a->a;
90249b5e25fSSatish Balay   xb = x;
90349b5e25fSSatish Balay 
90449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
90749b5e25fSSatish Balay     ib          = aj + *ai;
908831a3094SHong Zhang     jmin        = 0;
90998c9bda7SSatish Balay     nonzerorow += (n>0);
9107fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
91149b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
91249b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
91349b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
91449b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
91549b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
916831a3094SHong Zhang       v        += 25; jmin++;
9177fbae186SHong Zhang     }
918444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
919444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
920831a3094SHong Zhang     for (j=jmin; j<n; j++) {
92149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
92249b5e25fSSatish Balay       cval       = ib[j]*5;
92349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
92449b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
92549b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
92649b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
92749b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
92849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92949b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
93049b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
93149b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
93249b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
93349b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
93449b5e25fSSatish Balay       v        += 25;
93549b5e25fSSatish Balay     }
93649b5e25fSSatish Balay     xb +=5; ai++;
93749b5e25fSSatish Balay   }
93849b5e25fSSatish Balay 
939d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
940bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
94149b5e25fSSatish Balay 
942dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
94349b5e25fSSatish Balay   PetscFunctionReturn(0);
94449b5e25fSSatish Balay }
9454a2ae208SSatish Balay #undef __FUNCT__
9464a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
947dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
94849b5e25fSSatish Balay {
94949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
950d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
951d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
952d9ca1df4SBarry Smith   const MatScalar   *v;
9536849ba73SBarry Smith   PetscErrorCode    ierr;
954d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
955d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
95698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
95749b5e25fSSatish Balay 
95849b5e25fSSatish Balay   PetscFunctionBegin;
959343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
960d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9611ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
96249b5e25fSSatish Balay 
96349b5e25fSSatish Balay   v  = a->a;
96449b5e25fSSatish Balay   xb = x;
96549b5e25fSSatish Balay 
96649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
96749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
96949b5e25fSSatish Balay     ib          = aj + *ai;
970831a3094SHong Zhang     jmin        = 0;
97198c9bda7SSatish Balay     nonzerorow += (n>0);
9727fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
97349b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
97449b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
97549b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
97649b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
97749b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
97849b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
979831a3094SHong Zhang       v        += 36; jmin++;
9807fbae186SHong Zhang     }
981444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
982444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
983831a3094SHong Zhang     for (j=jmin; j<n; j++) {
98449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
98549b5e25fSSatish Balay       cval       = ib[j]*6;
98649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
98749b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
98849b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
98949b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
99049b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
99149b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
99249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
99349b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
99449b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
99549b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
99649b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
99749b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
99849b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
99949b5e25fSSatish Balay       v        += 36;
100049b5e25fSSatish Balay     }
100149b5e25fSSatish Balay     xb +=6; ai++;
100249b5e25fSSatish Balay   }
100349b5e25fSSatish Balay 
1004d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1005bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
100649b5e25fSSatish Balay 
1007dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
100849b5e25fSSatish Balay   PetscFunctionReturn(0);
100949b5e25fSSatish Balay }
101049b5e25fSSatish Balay 
10114a2ae208SSatish Balay #undef __FUNCT__
10124a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1013dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
101449b5e25fSSatish Balay {
101549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1016d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1017d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1018d9ca1df4SBarry Smith   const MatScalar   *v;
10196849ba73SBarry Smith   PetscErrorCode    ierr;
1020d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1021d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
102298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
102349b5e25fSSatish Balay 
102449b5e25fSSatish Balay   PetscFunctionBegin;
1025343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1026d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10271ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
102849b5e25fSSatish Balay 
102949b5e25fSSatish Balay   v  = a->a;
103049b5e25fSSatish Balay   xb = x;
103149b5e25fSSatish Balay 
103249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
103349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
103449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
103549b5e25fSSatish Balay     ib          = aj + *ai;
1036831a3094SHong Zhang     jmin        = 0;
103798c9bda7SSatish Balay     nonzerorow += (n>0);
10387fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
103949b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
104049b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
104149b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
104249b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
104349b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
104449b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
104549b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1046831a3094SHong Zhang       v        += 49; jmin++;
10477fbae186SHong Zhang     }
1048444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1049444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050831a3094SHong Zhang     for (j=jmin; j<n; j++) {
105149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
105249b5e25fSSatish Balay       cval       = ib[j]*7;
105349b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
105449b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
105549b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
105649b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
105749b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
105849b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
105949b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
106049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
106149b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
106249b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
106349b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
106449b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
106549b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
106649b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
106749b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
106849b5e25fSSatish Balay       v       += 49;
106949b5e25fSSatish Balay     }
107049b5e25fSSatish Balay     xb +=7; ai++;
107149b5e25fSSatish Balay   }
107249b5e25fSSatish Balay 
1073d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1074bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
107549b5e25fSSatish Balay 
1076dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
107749b5e25fSSatish Balay   PetscFunctionReturn(0);
107849b5e25fSSatish Balay }
107949b5e25fSSatish Balay 
10804a2ae208SSatish Balay #undef __FUNCT__
10814a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1082dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
108349b5e25fSSatish Balay {
108449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1085d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1086d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1087d9ca1df4SBarry Smith   const MatScalar   *v;
1088dfbe8321SBarry Smith   PetscErrorCode    ierr;
1089d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1090d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
109198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
109249b5e25fSSatish Balay 
109349b5e25fSSatish Balay   PetscFunctionBegin;
1094343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1095d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
10961ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
109749b5e25fSSatish Balay 
109849b5e25fSSatish Balay   aj = a->j;
109949b5e25fSSatish Balay   v  = a->a;
110049b5e25fSSatish Balay   ii = a->i;
110149b5e25fSSatish Balay 
110249b5e25fSSatish Balay   if (!a->mult_work) {
1103854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
110449b5e25fSSatish Balay   }
110549b5e25fSSatish Balay   work = a->mult_work;
110649b5e25fSSatish Balay 
110749b5e25fSSatish Balay 
110849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
110949b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
111049b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
111198c9bda7SSatish Balay     nonzerorow += (n>0);
111249b5e25fSSatish Balay 
111349b5e25fSSatish Balay     /* upper triangular part */
111449b5e25fSSatish Balay     for (j=0; j<n; j++) {
111549b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
111649b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
111749b5e25fSSatish Balay       workt += bs;
111849b5e25fSSatish Balay     }
111949b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
112096b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
112149b5e25fSSatish Balay 
112249b5e25fSSatish Balay     /* strict lower triangular part */
1123831a3094SHong Zhang     idx = aj+ii[0];
1124831a3094SHong Zhang     if (*idx == i) {
112596b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1126831a3094SHong Zhang     }
112749b5e25fSSatish Balay     if (ncols > 0) {
112849b5e25fSSatish Balay       workt = work;
112987828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
113096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1131831a3094SHong Zhang       for (j=0; j<n; j++) {
1132831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
113349b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
113449b5e25fSSatish Balay         workt += bs;
113549b5e25fSSatish Balay       }
113649b5e25fSSatish Balay     }
113749b5e25fSSatish Balay 
113849b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
113949b5e25fSSatish Balay   }
114049b5e25fSSatish Balay 
1141d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1142bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
114349b5e25fSSatish Balay 
1144dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
114549b5e25fSSatish Balay   PetscFunctionReturn(0);
114649b5e25fSSatish Balay }
114749b5e25fSSatish Balay 
11484a2ae208SSatish Balay #undef __FUNCT__
11494a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1150f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
115149b5e25fSSatish Balay {
115249b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1153f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1154efee365bSSatish Balay   PetscErrorCode ierr;
1155c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
115649b5e25fSSatish Balay 
115749b5e25fSSatish Balay   PetscFunctionBegin;
1158c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11598b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1160efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
116149b5e25fSSatish Balay   PetscFunctionReturn(0);
116249b5e25fSSatish Balay }
116349b5e25fSSatish Balay 
11644a2ae208SSatish Balay #undef __FUNCT__
11654a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1166dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
116749b5e25fSSatish Balay {
116849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1169d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
117049b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1171d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11726849ba73SBarry Smith   PetscErrorCode  ierr;
1173d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
117449b5e25fSSatish Balay 
117549b5e25fSSatish Balay   PetscFunctionBegin;
117649b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
117749b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
117849b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1179831a3094SHong Zhang       col  = aj + jmin;
1180831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
118149b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118249b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
118349b5e25fSSatish Balay         }
1184831a3094SHong Zhang         jmin++;
1185831a3094SHong Zhang       }
1186831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
118749b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118849b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
118949b5e25fSSatish Balay         }
119049b5e25fSSatish Balay       }
119149b5e25fSSatish Balay     }
11928f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
11930b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1194dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
11950b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
11960b8dc8d2SHong Zhang     il[0] = 0;
119749b5e25fSSatish Balay 
119849b5e25fSSatish Balay     *norm = 0.0;
119949b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
120049b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
120149b5e25fSSatish Balay       /*-- col sum --*/
120249b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1203831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1204831a3094SHong Zhang                   at step k */
120549b5e25fSSatish Balay       while (i<mbs) {
120649b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
120749b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
120849b5e25fSSatish Balay         for (j=0; j<bs; j++) {
120949b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
121049b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
121149b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
121249b5e25fSSatish Balay           }
121349b5e25fSSatish Balay         }
121449b5e25fSSatish Balay         /* update il, jl */
1215831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1216831a3094SHong Zhang         jmax = a->i[i+1];
121749b5e25fSSatish Balay         if (jmin < jmax) {
121849b5e25fSSatish Balay           il[i] = jmin;
121949b5e25fSSatish Balay           j     = a->j[jmin];
122049b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
122149b5e25fSSatish Balay         }
122249b5e25fSSatish Balay         i = nexti;
122349b5e25fSSatish Balay       }
122449b5e25fSSatish Balay       /*-- row sum --*/
122549b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
122649b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
122749b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122849b5e25fSSatish Balay           v = a->a + i*bs2 + j;
122949b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12300b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
123149b5e25fSSatish Balay           }
123249b5e25fSSatish Balay         }
123349b5e25fSSatish Balay       }
123449b5e25fSSatish Balay       /* add k_th block row to il, jl */
1235831a3094SHong Zhang       col = aj+jmin;
1236831a3094SHong Zhang       if (*col == k) jmin++;
123749b5e25fSSatish Balay       if (jmin < jmax) {
123849b5e25fSSatish Balay         il[k] = jmin;
12390b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
124049b5e25fSSatish Balay       }
124149b5e25fSSatish Balay       for (j=0; j<bs; j++) {
124249b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
124349b5e25fSSatish Balay       }
124449b5e25fSSatish Balay     }
124574ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1246f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
124749b5e25fSSatish Balay   PetscFunctionReturn(0);
124849b5e25fSSatish Balay }
124949b5e25fSSatish Balay 
12504a2ae208SSatish Balay #undef __FUNCT__
12514a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1252ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
125349b5e25fSSatish Balay {
125449b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1255dfbe8321SBarry Smith   PetscErrorCode ierr;
125649b5e25fSSatish Balay 
125749b5e25fSSatish Balay   PetscFunctionBegin;
125849b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1259d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1260ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1261ef511fbeSHong Zhang     PetscFunctionReturn(0);
126249b5e25fSSatish Balay   }
126349b5e25fSSatish Balay 
126449b5e25fSSatish Balay   /* if the a->i are the same */
126513f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126626fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
126749b5e25fSSatish Balay 
126849b5e25fSSatish Balay   /* if a->j are the same */
126913f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
127026fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
127126fbe8dcSKarl Rupp 
127249b5e25fSSatish Balay   /* if a->a are the same */
1273d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1274935af2e7SHong Zhang   PetscFunctionReturn(0);
127549b5e25fSSatish Balay }
127649b5e25fSSatish Balay 
12774a2ae208SSatish Balay #undef __FUNCT__
12784a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1279dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
128049b5e25fSSatish Balay {
128149b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1282dfbe8321SBarry Smith   PetscErrorCode  ierr;
1283d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1284d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
128587828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1286d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
128749b5e25fSSatish Balay 
128849b5e25fSSatish Balay   PetscFunctionBegin;
1289d0f46423SBarry Smith   bs = A->rmap->bs;
1290e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
129182799104SHong Zhang 
129249b5e25fSSatish Balay   aa   = a->a;
12938a0c6efdSHong Zhang   ambs = a->mbs;
12948a0c6efdSHong Zhang 
12958a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
12968a0c6efdSHong Zhang     PetscInt *diag=a->diag;
12978a0c6efdSHong Zhang     aa   = a->a;
12988a0c6efdSHong Zhang     ambs = a->mbs;
12998a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
13008a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13018a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13028a0c6efdSHong Zhang     PetscFunctionReturn(0);
13038a0c6efdSHong Zhang   }
13048a0c6efdSHong Zhang 
130549b5e25fSSatish Balay   ai   = a->i;
130649b5e25fSSatish Balay   aj   = a->j;
130749b5e25fSSatish Balay   bs2  = a->bs2;
13082dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13091ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
131049b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
131149b5e25fSSatish Balay     j=ai[i];
131249b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
131349b5e25fSSatish Balay       row  = i*bs;
131449b5e25fSSatish Balay       aa_j = aa + j*bs2;
131549b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
131649b5e25fSSatish Balay     }
131749b5e25fSSatish Balay   }
13181ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
131949b5e25fSSatish Balay   PetscFunctionReturn(0);
132049b5e25fSSatish Balay }
132149b5e25fSSatish Balay 
13224a2ae208SSatish Balay #undef __FUNCT__
13234a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1324dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
132549b5e25fSSatish Balay {
132649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1327d9ca1df4SBarry Smith   PetscScalar       x;
1328d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
132949b5e25fSSatish Balay   MatScalar         *aa,*v;
1330dfbe8321SBarry Smith   PetscErrorCode    ierr;
13315e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1332ace3abfcSBarry Smith   PetscBool         flg;
133349b5e25fSSatish Balay 
133449b5e25fSSatish Balay   PetscFunctionBegin;
1335b3bf805bSHong Zhang   if (ll != rr) {
1336b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1337e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1338b3bf805bSHong Zhang   }
1339b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
134049b5e25fSSatish Balay   ai  = a->i;
134149b5e25fSSatish Balay   aj  = a->j;
134249b5e25fSSatish Balay   aa  = a->a;
1343d0f46423SBarry Smith   m   = A->rmap->N;
1344d0f46423SBarry Smith   bs  = A->rmap->bs;
134549b5e25fSSatish Balay   mbs = a->mbs;
134649b5e25fSSatish Balay   bs2 = a->bs2;
134749b5e25fSSatish Balay 
1348d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
134949b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1350e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
135149b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
135249b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
135349b5e25fSSatish Balay     li = l + i*bs;
135449b5e25fSSatish Balay     v  = aa + bs2*ai[i];
135549b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
135649b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13575e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
135849b5e25fSSatish Balay         x = ri[k];
135949b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
136049b5e25fSSatish Balay       }
136149b5e25fSSatish Balay     }
136249b5e25fSSatish Balay   }
1363d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1364dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
136549b5e25fSSatish Balay   PetscFunctionReturn(0);
136649b5e25fSSatish Balay }
136749b5e25fSSatish Balay 
13684a2ae208SSatish Balay #undef __FUNCT__
13694a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1370dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
137149b5e25fSSatish Balay {
137249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
137349b5e25fSSatish Balay 
137449b5e25fSSatish Balay   PetscFunctionBegin;
137549b5e25fSSatish Balay   info->block_size   = a->bs2;
1376ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13776c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
137849b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
137949b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13808e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13817adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1382d5f3da31SBarry Smith   if (A->factortype) {
138349b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
138449b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
138549b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
138649b5e25fSSatish Balay   } else {
138749b5e25fSSatish Balay     info->fill_ratio_given  = 0;
138849b5e25fSSatish Balay     info->fill_ratio_needed = 0;
138949b5e25fSSatish Balay     info->factor_mallocs    = 0;
139049b5e25fSSatish Balay   }
139149b5e25fSSatish Balay   PetscFunctionReturn(0);
139249b5e25fSSatish Balay }
139349b5e25fSSatish Balay 
139449b5e25fSSatish Balay 
13954a2ae208SSatish Balay #undef __FUNCT__
13964a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1397dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
139849b5e25fSSatish Balay {
139949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1400dfbe8321SBarry Smith   PetscErrorCode ierr;
140149b5e25fSSatish Balay 
140249b5e25fSSatish Balay   PetscFunctionBegin;
140349b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
140449b5e25fSSatish Balay   PetscFunctionReturn(0);
140549b5e25fSSatish Balay }
1406dc354874SHong Zhang 
14074a2ae208SSatish Balay #undef __FUNCT__
1408985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1409985db425SBarry Smith /*
1410985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1411985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1412985db425SBarry Smith */
1413985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1414dc354874SHong Zhang {
1415dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1416dfbe8321SBarry Smith   PetscErrorCode  ierr;
1417d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1418d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1419c3fca9a7SHong Zhang   PetscReal       atmp;
1420d9ca1df4SBarry Smith   const MatScalar *aa;
1421985db425SBarry Smith   PetscScalar     *x;
142213f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1423dc354874SHong Zhang 
1424dc354874SHong Zhang   PetscFunctionBegin;
1425e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1426e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1427d0f46423SBarry Smith   bs  = A->rmap->bs;
1428dc354874SHong Zhang   aa  = a->a;
1429dc354874SHong Zhang   ai  = a->i;
1430dc354874SHong Zhang   aj  = a->j;
143144117c81SHong Zhang   mbs = a->mbs;
1432dc354874SHong Zhang 
1433985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14341ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1435dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1436e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
143744117c81SHong Zhang   for (i=0; i<mbs; i++) {
1438d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1439d0f6400bSHong Zhang     brow  = bs*i;
144044117c81SHong Zhang     for (j=0; j<ncols; j++) {
1441d0f6400bSHong Zhang       bcol = bs*(*aj);
144244117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1443d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
144444117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1445d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1446d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1447c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1448c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
144944117c81SHong Zhang         }
145044117c81SHong Zhang       }
1451d0f6400bSHong Zhang       aj++;
1452dc354874SHong Zhang     }
1453dc354874SHong Zhang   }
14541ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1455dc354874SHong Zhang   PetscFunctionReturn(0);
1456dc354874SHong Zhang }
1457